From 286f5bae7ac2b4f98c9b344e435bb06c229633ac Mon Sep 17 00:00:00 2001 From: lhy <442488254@qq.com> Date: Tue, 11 Jun 2024 20:19:12 +0800 Subject: [PATCH] Try to implement improved buildATen on all ops. --- impl/torch/functions/functions.cpp | 1308 +++++++++++++++++----------- impl/torch/helper.cpp | 28 + impl/torch/helper.hpp | 29 +- 3 files changed, 847 insertions(+), 518 deletions(-) diff --git a/impl/torch/functions/functions.cpp b/impl/torch/functions/functions.cpp index beac2dc9da..3175ebae65 100644 --- a/impl/torch/functions/functions.cpp +++ b/impl/torch/functions/functions.cpp @@ -11,6 +11,7 @@ #include #include +#include #include #ifdef USE_HIP @@ -19,6 +20,10 @@ #define FLT_MIN __FLT_MIN__ +#define CALL_ATEN_FUNC(func, ...) at::func(__VA_ARGS__) + +#define CALL_ATEN_CUDA_FUNC(func, ...) at::cuda::func(__VA_ARGS__) + #include "../helper.hpp" #include "../vision_kernel.h" @@ -57,35 +62,38 @@ const char* diopiGetImplVersion() { } diopiError_t diopiRelu(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atOut = impl::aten::buildATen(out); at::Tensor atInput = impl::aten::buildATen(input); - at::relu_out(atOut, atInput); + // CUDA dispatch not supported yet; will be supported in a subsequent release. + CALL_ATEN_FUNC(relu_out, atOut, atInput); return diopiSuccess; } diopiError_t diopiReluInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + at::Tensor atInput = impl::aten::buildATenSlow(input); at::relu_(atInput); return diopiSuccess; } diopiError_t diopiLeakyRelu(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, const diopiScalar_t* negative_slope) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atOut = impl::aten::buildATen(out); at::Tensor atInput = impl::aten::buildATen(input); at::Scalar atSlope = impl::aten::buildAtScalar(negative_slope); - at::leaky_relu_out(atOut, atInput, atSlope); + CALL_ATEN_CUDA_FUNC(leaky_relu_out, atOut, atInput, atSlope); return diopiSuccess; } diopiError_t diopiLeakyReluInp(diopiContextHandle_t ctx, diopiTensorHandle_t input, const diopiScalar_t* negative_slope) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + at::Tensor atInput = impl::aten::buildATenSlow(input); at::Scalar atSlope = impl::aten::buildAtScalar(negative_slope); at::leaky_relu_(atInput, atSlope); @@ -95,7 +103,7 @@ diopiError_t diopiLeakyReluInp(diopiContextHandle_t ctx, diopiTensorHandle_t inp diopiError_t diopiMaxPool2d(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiSize_t kernel_size, diopiSize_t stride, diopiSize_t padding, diopiSize_t dilation, bool ceil_mode) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + at::Tensor atInput = impl::aten::buildATenSlow(input); at::IntArrayRef atKernelSize = impl::aten::buildAtIntArray(kernel_size); at::IntArrayRef atStride = impl::aten::buildAtIntArray(stride); at::IntArrayRef atPadding = impl::aten::buildAtIntArray(padding); @@ -108,6 +116,7 @@ diopiError_t diopiMaxPool2d(diopiContextHandle_t ctx, diopiTensorHandle_t out, d diopiError_t diopiMaxPool2dWithIndices(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiTensorHandle_t indices, diopiConstTensorHandle_t input, diopiSize_t kernel_size, diopiSize_t stride, diopiSize_t padding, diopiSize_t dilation, bool ceil_mode) { + DIOPI_SCOPE_GUARD 
impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::IntArrayRef atKernelSize = impl::aten::buildAtIntArray(kernel_size); @@ -117,7 +126,7 @@ diopiError_t diopiMaxPool2dWithIndices(diopiContextHandle_t ctx, diopiTensorHand at::Tensor atOut = impl::aten::buildATen(out); at::Tensor atIndices = impl::aten::buildATen(indices); bool atCeilMode = ceil_mode; - at::max_pool2d_with_indices_out(atOut, atIndices, atInput, atKernelSize, atStride, atPadding, atDilation, atCeilMode); + CALL_ATEN_CUDA_FUNC(max_pool2d_with_indices_out, atOut, atIndices, atInput, atKernelSize, atStride, atPadding, atDilation, atCeilMode); return diopiSuccess; } @@ -128,12 +137,13 @@ diopiError_t diopiMaxPool2dWithIndices(diopiContextHandle_t ctx, diopiTensorHand */ diopiError_t diopiDiv(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t other, diopiRoundMode_t rounding_mode) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Tensor atOther = impl::aten::buildATen(other); at::Tensor atOut = impl::aten::buildATen(out); auto roundingMode = impl::aten::getRoundingMode(rounding_mode); - at::div_out(atOut, atInput, atOther, roundingMode); + CALL_ATEN_CUDA_FUNC(div_out, atOut, atInput, atOther, roundingMode); return diopiSuccess; } @@ -143,11 +153,12 @@ diopiError_t diopiDiv(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiCo * @param rounding_mode supported in pytorch>=1.8 */ diopiError_t diopiDivInp(diopiContextHandle_t ctx, diopiTensorHandle_t input, diopiConstTensorHandle_t other, diopiRoundMode_t rounding_mode) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Tensor atOther = impl::aten::buildATen(other); auto roundingMode = impl::aten::getRoundingMode(rounding_mode); - atInput.div_(atOther, roundingMode); + CALL_ATEN_CUDA_FUNC(div_, atInput, atOther, roundingMode); return diopiSuccess; } @@ -158,12 +169,13 @@ diopiError_t diopiDivInp(diopiContextHandle_t ctx, diopiTensorHandle_t input, di */ diopiError_t diopiDivScalar(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, const diopiScalar_t* other, diopiRoundMode_t rounding_mode) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); auto atOther = impl::aten::buildAtScalar(other); auto roundingMode = impl::aten::getRoundingMode(rounding_mode); auto atOut = impl::aten::buildATen(out); - at::div_out(atOut, atInput, c10::scalar_to_tensor(atOther), roundingMode); + CALL_ATEN_CUDA_FUNC(div_out, atOut, atInput, c10::scalar_to_tensor(atOther), roundingMode); return diopiSuccess; } @@ -173,11 +185,12 @@ diopiError_t diopiDivScalar(diopiContextHandle_t ctx, diopiTensorHandle_t out, d * @param rounding_mode supported in pytorch>=1.8.0 */ diopiError_t diopiDivInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t input, const diopiScalar_t* other, diopiRoundMode_t rounding_mode) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); auto atOther = impl::aten::buildAtScalar(other); auto roundingMode = impl::aten::getRoundingMode(rounding_mode); - atInput.div_(atOther, roundingMode); + CALL_ATEN_CUDA_FUNC(div_, atInput, c10::scalar_to_tensor(atOther), roundingMode); return diopiSuccess; } @@ -185,24 +198,26 @@ diopiError_t diopiDivInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t inp diopiError_t diopiConvolution2d(diopiContextHandle_t ctx, 
diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t weight, diopiConstTensorHandle_t bias, diopiSize_t stride, diopiSize_t padding, diopiSize_t dilation, int64_t groups) { impl::aten::setCurStream(ctx); - auto atInput = impl::aten::buildATen(input); - auto atWeight = impl::aten::buildATen(weight); - auto atBias = impl::aten::buildATen(bias); + auto atInput = impl::aten::buildATenSlow(input); + auto atWeight = impl::aten::buildATenSlow(weight); + auto atBias = impl::aten::buildATenSlow(bias); auto atStride = impl::aten::buildAtIntArray(stride); auto atPadding = impl::aten::buildAtIntArray(padding); auto atDilation = impl::aten::buildAtIntArray(dilation); - auto atOut = impl::aten::buildATen(out); + auto atOut = impl::aten::buildATenSlow(out); if (torch::cuda::cudnn_is_available()) { DIOPI_CHECK(atInput.options().type_equal(atWeight.options()), "Input type and weight type should be the same"); DIOPI_CHECK(!atBias.defined() || (atInput.options().type_equal(atBias.options())), "Input type and bias type should be the same"); - at::cudnn_convolution_out(atOut, atInput, atWeight, atPadding, atStride, atDilation, groups, false, false, true); + // CUDA dispatch not supported yet; will be supported in a subsequent release. + CALL_ATEN_FUNC(cudnn_convolution_out, atOut, atInput, atWeight, atPadding, atStride, atDilation, groups, false, false, true); if (atBias.defined()) { std::vector<int64_t> shape(atInput.dim(), 1); shape[1] = -1; atOut.add_(atBias.reshape(shape)); } } else { - at::convolution_out(atOut, atInput, atWeight, atBias, atStride, atPadding, atDilation, false, at::IntArrayRef(0), groups); + // CUDA dispatch not supported yet; will be supported in a subsequent release. + CALL_ATEN_FUNC(convolution_out, atOut, atInput, atWeight, atBias, atStride, atPadding, atDilation, false, at::IntArrayRef(0), groups); } return diopiSuccess; @@ -217,9 +232,9 @@ diopiError_t diopiConvolution2d(diopiContextHandle_t ctx, diopiTensorHandle_t ou diopiError_t diopiCrossEntropyLoss(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t target, diopiConstTensorHandle_t weight, diopiReduction_t reduction, int64_t ignore_index, double label_smoothing) { impl::aten::setCurStream(ctx); - auto atInput = impl::aten::buildATen(input); - auto atTarget = impl::aten::buildATen(target); - auto atWeight = impl::aten::buildATen(weight); + auto atInput = impl::aten::buildATenSlow(input); + auto atTarget = impl::aten::buildATenSlow(target); + auto atWeight = impl::aten::buildATenSlow(weight); #if TORCH_MM_VERSION >= TORCH_1_10_MM_VERSION auto atOut = at::cross_entropy_loss(atInput, atTarget, atWeight, reduction, ignore_index, label_smoothing); impl::aten::updateATen2Tensor(ctx, atOut, out); @@ -235,96 +250,105 @@ diopiError_t diopiCrossEntropyLoss(diopiContextHandle_t ctx, diopiTensorHandle_t } diopiError_t diopiBmm(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t mat2) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); auto atMat2 = impl::aten::buildATen(mat2); auto atOut = impl::aten::buildATen(out); - at::bmm_out(atOut, atInput, atMat2); + CALL_ATEN_CUDA_FUNC(bmm_out, atOut, atInput, atMat2); return diopiSuccess; } diopiError_t diopiBaddbmm(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t batch1, diopiConstTensorHandle_t batch2, double beta, double alpha) { + DIOPI_SCOPE_GUARD 
impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); auto atOut = impl::aten::buildATen(out); auto atBatch1 = impl::aten::buildATen(batch1); auto atBatch2 = impl::aten::buildATen(batch2); - at::baddbmm_out(atOut, atInput, atBatch1, atBatch2, beta, alpha); + CALL_ATEN_CUDA_FUNC(baddbmm_out, atOut, atInput, atBatch1, atBatch2, beta, alpha); return diopiSuccess; } diopiError_t diopiBaddbmmInp(diopiContextHandle_t ctx, diopiTensorHandle_t input, diopiConstTensorHandle_t batch1, diopiConstTensorHandle_t batch2, double beta, double alpha) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); auto atBatch1 = impl::aten::buildATen(batch1); auto atBatch2 = impl::aten::buildATen(batch2); - atInput.baddbmm_(atBatch1, atBatch2, beta, alpha); + CALL_ATEN_CUDA_FUNC(baddbmm_, atInput, atBatch1, atBatch2, beta, alpha); return diopiSuccess; } diopiError_t diopiAddcmul(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t tensor1, diopiConstTensorHandle_t tensor2, const diopiScalar_t* value) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atOut = impl::aten::buildATen(out); auto atInput = impl::aten::buildATen(input); auto atTensor1 = impl::aten::buildATen(tensor1); auto atTensor2 = impl::aten::buildATen(tensor2); auto atValue = impl::aten::buildAtScalar(value); - at::addcmul_out(atOut, atInput, atTensor1, atTensor2, atValue); + CALL_ATEN_CUDA_FUNC(addcmul_out, atOut, atInput, atTensor1, atTensor2, atValue); return diopiSuccess; } diopiError_t diopiAddcmulInp(diopiContextHandle_t ctx, diopiTensorHandle_t input, diopiConstTensorHandle_t tensor1, diopiConstTensorHandle_t tensor2, const diopiScalar_t* value) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); auto atTensor1 = impl::aten::buildATen(tensor1); auto atTensor2 = impl::aten::buildATen(tensor2); auto atValue = impl::aten::buildAtScalar(value); - atInput.addcmul_(atTensor1, atTensor2, atValue); + CALL_ATEN_CUDA_FUNC(addcmul_, atInput, atTensor1, atTensor2, atValue); return diopiSuccess; } diopiError_t diopiMatmul(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t other) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atOut = impl::aten::buildATen(out); auto atInput = impl::aten::buildATen(input); auto atOther = impl::aten::buildATen(other); // Note(huqingqing): pytorch optimize the bmm case by folding the batch into the first dimension. // It changes the shape of output and causes warnning when using matmul_out. - at::matmul_out(atOut, atInput, atOther); + // CUDA dispatch not supported yet; will be supported in a subsequent release. 
+ CALL_ATEN_FUNC(matmul_out, atOut, atInput, atOther); return diopiSuccess; } diopiError_t diopiAddcdiv(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t tensor1, diopiConstTensorHandle_t tensor2, const diopiScalar_t* value) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); auto atTensor1 = impl::aten::buildATen(tensor1); auto atTensor2 = impl::aten::buildATen(tensor2); auto atValue = impl::aten::buildAtScalar(value); auto atOut = impl::aten::buildATen(out); - at::addcdiv_out(atOut, atInput, atTensor1, atTensor2, atValue); + CALL_ATEN_CUDA_FUNC(addcdiv_out, atOut, atInput, atTensor1, atTensor2, atValue); return diopiSuccess; } diopiError_t diopiAddcdivInp(diopiContextHandle_t ctx, diopiTensorHandle_t input, diopiConstTensorHandle_t tensor1, diopiConstTensorHandle_t tensor2, const diopiScalar_t* value) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); auto atTensor1 = impl::aten::buildATen(tensor1); auto atTensor2 = impl::aten::buildATen(tensor2); auto atValue = impl::aten::buildAtScalar(value); - atInput.addcdiv_(atTensor1, atTensor2, atValue); + CALL_ATEN_CUDA_FUNC(addcdiv_, atInput, atTensor1, atTensor2, atValue); return diopiSuccess; } @@ -332,6 +356,7 @@ diopiError_t diopiAddcdivInp(diopiContextHandle_t ctx, diopiTensorHandle_t input // CAFFE2_API Tensor addmm(const Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1); diopiError_t diopiAddmm(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t mat1, diopiConstTensorHandle_t mat2, const diopiScalar_t* beta, const diopiScalar_t* alpha) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); auto atMax1 = impl::aten::buildATen(mat1); @@ -339,28 +364,32 @@ diopiError_t diopiAddmm(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopi auto atBeta = impl::aten::buildAtScalar(beta); auto atAlpha = impl::aten::buildAtScalar(alpha); auto atOut = impl::aten::buildATen(out); - at::addmm_out(atOut, atInput, atMax1, atMax2, atBeta, atAlpha); + CALL_ATEN_CUDA_FUNC(addmm_out, atOut, atInput, atMax1, atMax2, atBeta, atAlpha); return diopiSuccess; } // NOTE(fengsibo): add int, short, bool test case diopiError_t diopiMean(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiSize_t dim) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); auto atOut = impl::aten::buildATen(out); auto atDim = impl::aten::buildAtIntArray(dim); bool keepdim = false; if (atInput.dim() == atOut.dim()) { keepdim = true; } - at::mean_out(atOut, atInput, atDim, keepdim); // TODO(fengsibo): use default type instead + CALL_ATEN_CUDA_FUNC(mean_out, atOut, atInput, atDim, keepdim); // TODO(fengsibo): use default type instead return diopiSuccess; } // NOTE(fengsibo): add int, short, bool test case diopiError_t diopiSum(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiSize_t dim) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); auto atOut = impl::aten::buildATen(out); @@ -370,12 +399,13 @@ diopiError_t diopiSum(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiCo bool keepdim = false; if (atInput.dim() == 
atOut.dim()) { keepdim = true; } - at::sum_out(atOut, atInput, atDim, keepdim); + CALL_ATEN_CUDA_FUNC(sum_out, atOut, atInput, atDim, keepdim); return diopiSuccess; } diopiError_t diopiStd(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiSize_t dim, bool unbiased) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); auto atOut = impl::aten::buildATen(out); @@ -384,12 +414,14 @@ diopiError_t diopiStd(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiCo if (atInput.dim() == atOut.dim()) { keepdim = true; } - at::std_out(atOut, atInput, atDim, unbiased, keepdim); + //Note can not compile on pytorch 2.1 + CALL_ATEN_CUDA_FUNC(std_out, atOut, atInput, atDim, unbiased, keepdim); return diopiSuccess; } diopiError_t diopiMin(diopiContextHandle_t ctx, diopiTensorHandle_t min, diopiTensorHandle_t min_indices, diopiConstTensorHandle_t input, int64_t dim) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); auto atOut = impl::aten::buildATen(min); @@ -398,21 +430,22 @@ diopiError_t diopiMin(diopiContextHandle_t ctx, diopiTensorHandle_t min, diopiTe if (atInput.dim() == atOut.dim()) { keepdim = true; } - at::min_out(atOut, atIndices, atInput, dim, keepdim); + CALL_ATEN_CUDA_FUNC(min_out, atOut, atIndices, atInput, dim, keepdim); return diopiSuccess; } diopiError_t diopiMinAll(diopiContextHandle_t ctx, diopiTensorHandle_t min, diopiConstTensorHandle_t input) { impl::aten::setCurStream(ctx); - auto atInput = impl::aten::buildATen(input); - auto atOut = impl::aten::buildATen(min); + auto atInput = impl::aten::buildATenSlow(input); + auto atOut = impl::aten::buildATenSlow(min); impl::aten::invokeATenFuncRet(ctx, at::min, min, atInput); return diopiSuccess; } diopiError_t diopiMax(diopiContextHandle_t ctx, diopiTensorHandle_t max, diopiTensorHandle_t max_indices, diopiConstTensorHandle_t input, int64_t dim) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); auto atOut = impl::aten::buildATen(max); @@ -421,20 +454,21 @@ diopiError_t diopiMax(diopiContextHandle_t ctx, diopiTensorHandle_t max, diopiTe if (atInput.dim() == atOut.dim()) { keepdim = true; } - at::max_out(atOut, atIndices, atInput, dim, keepdim); + CALL_ATEN_CUDA_FUNC(max_out, atOut, atIndices, atInput, dim, keepdim); return diopiSuccess; } diopiError_t diopiMaxAll(diopiContextHandle_t ctx, diopiTensorHandle_t max, diopiConstTensorHandle_t input) { impl::aten::setCurStream(ctx); - auto atInput = impl::aten::buildATen(input); + auto atInput = impl::aten::buildATenSlow(input); impl::aten::invokeATenFuncRet(ctx, at::max, max, atInput); return diopiSuccess; } diopiError_t diopiAny(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, const int64_t* dim) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); auto atOut = impl::aten::buildATen(out); @@ -443,15 +477,16 @@ diopiError_t diopiAny(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiCo keepdim = true; } if (dim == nullptr) { - at::any_out(atOut, atInput); + CALL_ATEN_CUDA_FUNC(any_out, atOut, atInput); } else { - at::any_out(atOut, atInput, *dim, keepdim); + CALL_ATEN_CUDA_FUNC(any_out, atOut, atInput, *dim, keepdim); } return diopiSuccess; } diopiError_t diopiAll(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, const int64_t* dim) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atInput = 
impl::aten::buildATen(input); auto atOut = impl::aten::buildATen(out); @@ -460,45 +495,53 @@ diopiError_t diopiAll(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiCo keepdim = true; } if (dim == nullptr) { - at::all_out(atOut, atInput); + CALL_ATEN_CUDA_FUNC(all_out, atOut, atInput); } else { - at::all_out(atOut, atInput, *dim, keepdim); + CALL_ATEN_CUDA_FUNC(all_out, atOut, atInput, *dim, keepdim); } return diopiSuccess; } diopiError_t diopiSoftmax(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, int64_t dim) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); auto atOut = impl::aten::buildATen(out); - at::softmax_out(atOut, atInput, dim); + // CUDA dispatch not supported yet; will be supported in a subsequent release. + CALL_ATEN_FUNC(softmax_out, atOut, atInput, dim); return diopiSuccess; } diopiError_t diopiLogSoftmax(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, int64_t dim) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); auto atOut = impl::aten::buildATen(out); - at::log_softmax_out(atOut, atInput, dim); + // CUDA dispatch not supported yet; will be supported in a subsequent release. + CALL_ATEN_FUNC(log_softmax_out, atOut, atInput, dim); return diopiSuccess; } diopiError_t diopiIndexSelect(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, int64_t dim, diopiConstTensorHandle_t index) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); auto atIndex = impl::aten::buildATen(index); auto atOut = impl::aten::buildATen(out); - at::index_select_out(atOut, atInput, dim, atIndex); + CALL_ATEN_CUDA_FUNC(index_select_out, atOut, atInput, dim, atIndex); return diopiSuccess; } diopiError_t diopiSelect(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, int64_t dim, int64_t index) { impl::aten::setCurStream(ctx); - auto atInput = impl::aten::buildATen(input); + auto atInput = impl::aten::buildATenSlow(input); at::Tensor atOut = at::select(atInput, dim, index).contiguous(); impl::aten::updateATen2Tensor(ctx, atOut, out); @@ -507,12 +550,14 @@ diopiError_t diopiSelect(diopiContextHandle_t ctx, diopiTensorHandle_t out, diop diopiError_t diopiMaskedScatter(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t mask, diopiConstTensorHandle_t source) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); auto atMask = impl::aten::buildATen(mask); auto atSource = impl::aten::buildATen(source); auto atOut = impl::aten::buildATen(out); - at::masked_scatter_out(atOut, atInput, atMask, atSource); + // CUDA dispatch not supported yet; will be supported in a subsequent release. 
+ CALL_ATEN_FUNC(masked_scatter_out, atOut, atInput, atMask, atSource); return diopiSuccess; } @@ -520,8 +565,8 @@ diopiError_t diopiMaskedScatter(diopiContextHandle_t ctx, diopiTensorHandle_t ou diopiError_t diopiNms(diopiContextHandle_t ctx, diopiTensorHandle_t* out, diopiConstTensorHandle_t dets, diopiConstTensorHandle_t scores, double iouThreshold) { impl::aten::setCurStream(ctx); DIOPI_CHECK_PTR(out); - auto atDets = impl::aten::buildATen(dets); - auto atScores = impl::aten::buildATen(scores); + auto atDets = impl::aten::buildATenSlow(dets); + auto atScores = impl::aten::buildATenSlow(scores); auto atOut = vision::ops::nms_kernel(atDets, atScores, iouThreshold); impl::aten::buildDiopiTensor(ctx, atOut, out); @@ -531,7 +576,7 @@ diopiError_t diopiNms(diopiContextHandle_t ctx, diopiTensorHandle_t* out, diopiC diopiError_t diopiNonzero(diopiContextHandle_t ctx, diopiTensorHandle_t* out, diopiConstTensorHandle_t input) { impl::aten::setCurStream(ctx); DIOPI_CHECK_PTR(out); - auto atInput = impl::aten::buildATen(input); + auto atInput = impl::aten::buildATenSlow(input); auto atOut = at::nonzero(atInput); impl::aten::buildDiopiTensor(ctx, atOut, out); @@ -540,12 +585,14 @@ diopiError_t diopiNonzero(diopiContextHandle_t ctx, diopiTensorHandle_t* out, di diopiError_t diopiLinear(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t weight, diopiConstTensorHandle_t bias) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atOut = impl::aten::buildATen(out); auto atInput = impl::aten::buildATen(input); auto atWeight = impl::aten::buildATen(weight); auto atBias = impl::aten::buildATen(bias); - at::linear_out(atOut, atInput, atWeight, atBias); + // CUDA dispatch not supported yet; will be supported in a subsequent release. 
+ CALL_ATEN_FUNC(linear_out, atOut, atInput, atWeight, atBias); return diopiSuccess; } @@ -553,8 +600,8 @@ diopiError_t diopiLinear(diopiContextHandle_t ctx, diopiTensorHandle_t out, diop diopiError_t diopiRoiAlign(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t rois, double spatialScale, int64_t pooledHeight, int64_t pooledWidth, int64_t samplingRatio, bool aligned) { impl::aten::setCurStream(ctx); - auto atInput = impl::aten::buildATen(input); - auto atRois = impl::aten::buildATen(rois); + auto atInput = impl::aten::buildATenSlow(input); + auto atRois = impl::aten::buildATenSlow(rois); auto atOut = vision::ops::roi_align_forward_kernel(atInput, atRois, spatialScale, pooledHeight, pooledWidth, samplingRatio, aligned); impl::aten::updateATen2Tensor(ctx, atOut, out); @@ -564,9 +611,9 @@ diopiError_t diopiRoiAlign(diopiContextHandle_t ctx, diopiTensorHandle_t out, di diopiError_t diopiSgd(diopiContextHandle_t ctx, diopiTensorHandle_t param, diopiTensorHandle_t grad, diopiTensorHandle_t buf, double learningrate, double momentum, double dampening, double weightDecay, bool nesterov) { impl::aten::setCurStream(ctx); - auto atParam = impl::aten::buildATen(param); - auto atGrad = impl::aten::buildATen(grad); - auto atBuf = impl::aten::buildATen(buf); + auto atParam = impl::aten::buildATenSlow(param); + auto atGrad = impl::aten::buildATenSlow(grad); + auto atBuf = impl::aten::buildATenSlow(buf); atParam.requires_grad_(true); atParam.mutable_grad() = atGrad; @@ -603,7 +650,7 @@ diopiError_t diopiClipGradNorm(diopiContextHandle_t ctx, double* out, diopiTenso bool errorIfNonfinite) { impl::aten::setCurStream(ctx); DIOPI_CHECK(grads != nullptr && out != nullptr, "Not supported: out or parameters is nullptr"); - auto atGrads = impl::aten::buildATenList(grads, num_grads); + auto atGrads = impl::aten::buildATenListSlow(grads, num_grads); at::Tensor total_norm_tensor; if (normType == std::numeric_limits<double>::infinity()) { std::vector norms; @@ -645,7 +692,7 @@ diopiError_t diopiClipGradNorm(diopiContextHandle_t ctx, double* out, diopiTenso diopiError_t diopiEmbeddingRenorm_(diopiContextHandle_t ctx, diopiTensorHandle_t inout, diopiConstTensorHandle_t indices, double max_norm, double norm_type) { impl::aten::setCurStream(ctx); - auto atSelf = impl::aten::buildATen(inout); + auto atSelf = impl::aten::buildATenSlow(inout); auto atIndices = impl::aten::buildATen(indices); at::embedding_renorm_(atSelf, atIndices, max_norm, norm_type); @@ -654,30 +701,34 @@ diopiError_t diopiEmbeddingRenorm_(diopiContextHandle_t ctx, diopiTensorHandle_t diopiError_t diopiEmbedding(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t weight, diopiConstTensorHandle_t indices, int64_t paddingIdx, bool scaleGradByFreq, bool sparse) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atWeight = impl::aten::buildATen(weight); auto atIndices = impl::aten::buildATen(indices); auto atOut = impl::aten::buildATen(out); - at::embedding_out(atOut, atWeight, atIndices, paddingIdx, scaleGradByFreq, sparse); + // CUDA dispatch not supported yet; will be supported in a subsequent release. 
+ CALL_ATEN_FUNC(embedding_out, atOut, atWeight, atIndices, paddingIdx, scaleGradByFreq, sparse); return diopiSuccess; } diopiError_t diopiTril(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, int64_t diagonal) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); auto atOut = impl::aten::buildATen(out); - at::tril_out(atOut, atInput, diagonal); + CALL_ATEN_CUDA_FUNC(tril_out, atOut, atInput, diagonal); return diopiSuccess; } diopiError_t diopiCat(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t* tensors, int64_t insNum, int64_t dim) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); DIOPI_CHECK_PTR(tensors); auto tensorList = impl::aten::buildATenList(tensors, insNum); auto atOut = impl::aten::buildATen(out); - at::cat_out(atOut, tensorList, dim); + CALL_ATEN_CUDA_FUNC(cat_out, atOut, tensorList, dim); return diopiSuccess; } @@ -697,27 +748,29 @@ diopiError_t diopiSplitWithSizes(diopiContextHandle_t ctx, diopiTensorHandle_t* } diopiError_t diopiStack(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t* tensors, int64_t numTensors, int64_t dim) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); DIOPI_CHECK_PTR(tensors); auto tensorList = impl::aten::buildATenList(tensors, numTensors); auto atOut = impl::aten::buildATen(out); - at::stack_out(atOut, tensorList, dim); + // CUDA dispatch not supported yet; will be supported in a subsequent release. + CALL_ATEN_FUNC(stack_out, atOut, tensorList, dim); return diopiSuccess; } diopiError_t diopiSort(diopiContextHandle_t ctx, diopiTensorHandle_t values, diopiTensorHandle_t indices, diopiConstTensorHandle_t input, int64_t dim, bool descending, const bool* stable) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); auto atValues = impl::aten::buildATen(values); auto atIndices = impl::aten::buildATen(indices); #if TORCH_MM_VERSION <= TORCH_1_8_MM_VERSION - at::sort_out(atValues, atIndices, atInput, dim, descending); + CALL_ATEN_CUDA_FUNC(sort_out, atValues, atIndices, atInput, dim, descending); #else c10::optional<bool> atStable = stable ? 
c10::optional<bool>(*stable) : c10::optional<bool>(false); - at::sort_out(atValues, atIndices, atInput, atStable, dim, descending); + CALL_ATEN_CUDA_FUNC(sort_out, atValues, atIndices, atInput, atStable, dim, descending); #endif return diopiSuccess; @@ -725,19 +778,20 @@ diopiError_t diopiSort(diopiContextHandle_t ctx, diopiTensorHandle_t values, dio diopiError_t diopiTopk(diopiContextHandle_t ctx, diopiTensorHandle_t values, diopiTensorHandle_t indices, diopiConstTensorHandle_t input, int64_t k, int64_t dim, bool largest, bool sorted) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); auto atValues = impl::aten::buildATen(values); auto atIndices = impl::aten::buildATen(indices); - at::topk_out(atValues, atIndices, atInput, k, dim, largest, sorted); + CALL_ATEN_CUDA_FUNC(topk_out, atValues, atIndices, atInput, k, dim, largest, sorted); return diopiSuccess; } diopiError_t diopiTranspose(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, int64_t dim0, int64_t dim1) { impl::aten::setCurStream(ctx); - at::Tensor atOut = impl::aten::buildATen(out); - auto atInput = impl::aten::buildATen(input); + at::Tensor atOut = impl::aten::buildATenSlow(out); + auto atInput = impl::aten::buildATenSlow(input); impl::aten::invokeATenFuncRet(ctx, at::transpose, out, atInput, dim0, dim1); return diopiSuccess; @@ -745,7 +799,7 @@ diopiError_t diopiTranspose(diopiContextHandle_t ctx, diopiTensorHandle_t out, d diopiError_t diopiOneHot(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, int64_t numClasses) { impl::aten::setCurStream(ctx); - auto atInput = impl::aten::buildATen(input); + auto atInput = impl::aten::buildATenSlow(input); impl::aten::invokeATenFuncRet(ctx, at::one_hot, out, atInput, numClasses); return diopiSuccess; @@ -753,190 +807,202 @@ diopiError_t diopiOneHot(diopiContextHandle_t ctx, diopiTensorHandle_t out, diop diopiError_t diopiWhere(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t condition, diopiConstTensorHandle_t input, diopiConstTensorHandle_t other) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atOut = impl::aten::buildATen(out); auto atCondition = impl::aten::buildATen(condition); auto atInput = impl::aten::buildATen(input); auto atOther = impl::aten::buildATen(other); - at::where_out(atOut, atCondition, atInput, atOther); + CALL_ATEN_CUDA_FUNC(where_out, atOut, atCondition, atInput, atOther); return diopiSuccess; } diopiError_t diopiSin(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Tensor atOut = impl::aten::buildATen(out); - at::sin_out(atOut, atInput); + CALL_ATEN_CUDA_FUNC(sin_out, atOut, atInput); return diopiSuccess; } diopiError_t diopiSinInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + at::Tensor atInput = impl::aten::buildATenSlow(input); at::sin_(atInput); return diopiSuccess; } diopiError_t diopiCos(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Tensor atOut = impl::aten::buildATen(out); - at::cos_out(atOut, atInput); + CALL_ATEN_CUDA_FUNC(cos_out, atOut, atInput); return diopiSuccess; } diopiError_t diopiCosInp(diopiContextHandle_t ctx, diopiTensorHandle_t 
input) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + at::Tensor atInput = impl::aten::buildATenSlow(input); at::cos_(atInput); return diopiSuccess; } diopiError_t diopiAbs(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Tensor atOut = impl::aten::buildATen(out); - at::abs_out(atOut, atInput); + CALL_ATEN_CUDA_FUNC(abs_out, atOut, atInput); return diopiSuccess; } diopiError_t diopiAbsInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + at::Tensor atInput = impl::aten::buildATenSlow(input); at::abs_(atInput); return diopiSuccess; } diopiError_t diopiSqrt(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Tensor atOut = impl::aten::buildATen(out); - at::sqrt_out(atOut, atInput); + CALL_ATEN_CUDA_FUNC(sqrt_out, atOut, atInput); return diopiSuccess; } diopiError_t diopiSqrtInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + at::Tensor atInput = impl::aten::buildATenSlow(input); at::sqrt_(atInput); return diopiSuccess; } diopiError_t diopiRsqrt(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Tensor atOut = impl::aten::buildATen(out); - at::rsqrt_out(atOut, atInput); + CALL_ATEN_CUDA_FUNC(rsqrt_out, atOut, atInput); return diopiSuccess; } diopiError_t diopiRsqrtInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + at::Tensor atInput = impl::aten::buildATenSlow(input); at::rsqrt_(atInput); return diopiSuccess; } diopiError_t diopiFloor(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Tensor atOut = impl::aten::buildATen(out); - at::floor_out(atOut, atInput); + CALL_ATEN_CUDA_FUNC(floor_out, atOut, atInput); return diopiSuccess; } diopiError_t diopiFloorInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + at::Tensor atInput = impl::aten::buildATenSlow(input); at::floor_(atInput); return diopiSuccess; } diopiError_t diopiNeg(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Tensor atOut = impl::aten::buildATen(out); - at::neg_out(atOut, atInput); + CALL_ATEN_CUDA_FUNC(neg_out, atOut, atInput); return diopiSuccess; } diopiError_t diopiNegInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + at::Tensor atInput = impl::aten::buildATenSlow(input); at::neg_(atInput); return diopiSuccess; } diopiError_t diopiSign(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Tensor atOut = 
impl::aten::buildATen(out); - at::sign_out(atOut, atInput); + CALL_ATEN_CUDA_FUNC(sign_out, atOut, atInput); return diopiSuccess; } diopiError_t diopiTanh(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Tensor atOut = impl::aten::buildATen(out); - at::tanh_out(atOut, atInput); + CALL_ATEN_CUDA_FUNC(tanh_out, atOut, atInput); return diopiSuccess; } diopiError_t diopiTanhInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + at::Tensor atInput = impl::aten::buildATenSlow(input); at::tanh_(atInput); return diopiSuccess; } diopiError_t diopiAtan(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Tensor atOut = impl::aten::buildATen(out); - at::atan_out(atOut, atInput); + CALL_ATEN_CUDA_FUNC(atan_out, atOut, atInput); return diopiSuccess; } diopiError_t diopiAtanInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + at::Tensor atInput = impl::aten::buildATenSlow(input); at::atan_(atInput); return diopiSuccess; } diopiError_t diopiSigmoid(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Tensor atOut = impl::aten::buildATen(out); - at::sigmoid_out(atOut, atInput); + CALL_ATEN_CUDA_FUNC(sigmoid_out, atOut, atInput); return diopiSuccess; } diopiError_t diopiSigmoidInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + at::Tensor atInput = impl::aten::buildATenSlow(input); at::sigmoid_(atInput); return diopiSuccess; @@ -944,634 +1010,697 @@ diopiError_t diopiSigmoidInp(diopiContextHandle_t ctx, diopiTensorHandle_t input diopiError_t diopiSiluInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + at::Tensor atInput = impl::aten::buildATenSlow(input); at::silu_(atInput); return diopiSuccess; } diopiError_t diopiSilu(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Tensor atOut = impl::aten::buildATen(out); - at::silu_out(atOut, atInput); + CALL_ATEN_CUDA_FUNC(silu_out, atOut, atInput); return diopiSuccess; } diopiError_t diopiSiluBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiConstTensorHandle_t grad_output, diopiConstTensorHandle_t input) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atGradInput = impl::aten::buildATen(grad_input); auto atGradOutput = impl::aten::buildATen(grad_output); auto atInput = impl::aten::buildATen(input); - at::silu_backward_out(atGradInput, atGradOutput, atInput); + CALL_ATEN_CUDA_FUNC(silu_backward_out, atGradInput, atGradOutput, atInput); return diopiSuccess; } diopiError_t diopiExp(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Tensor atOut = impl::aten::buildATen(out); - 
at::exp_out(atOut, atInput); + CALL_ATEN_CUDA_FUNC(exp_out, atOut, atInput); return diopiSuccess; } diopiError_t diopiExpInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + at::Tensor atInput = impl::aten::buildATenSlow(input); at::exp_(atInput); return diopiSuccess; } diopiError_t diopiLog(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Tensor atOut = impl::aten::buildATen(out); - at::log_out(atOut, atInput); + CALL_ATEN_CUDA_FUNC(log_out, atOut, atInput); return diopiSuccess; } diopiError_t diopiLogInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + at::Tensor atInput = impl::aten::buildATenSlow(input); at::log_(atInput); return diopiSuccess; } diopiError_t diopiLog2(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Tensor atOut = impl::aten::buildATen(out); - at::log2_out(atOut, atInput); + CALL_ATEN_CUDA_FUNC(log2_out, atOut, atInput); return diopiSuccess; } diopiError_t diopiLog2Inp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + at::Tensor atInput = impl::aten::buildATenSlow(input); at::log2_(atInput); return diopiSuccess; } diopiError_t diopiLog10(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Tensor atOut = impl::aten::buildATen(out); - at::log10_out(atOut, atInput); + CALL_ATEN_CUDA_FUNC(log10_out, atOut, atInput); return diopiSuccess; } diopiError_t diopiLog10Inp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + at::Tensor atInput = impl::aten::buildATenSlow(input); at::log10_(atInput); return diopiSuccess; } diopiError_t diopiErf(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Tensor atOut = impl::aten::buildATen(out); - at::erf_out(atOut, atInput); + CALL_ATEN_CUDA_FUNC(erf_out, atOut, atInput); return diopiSuccess; } diopiError_t diopiErfInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + at::Tensor atInput = impl::aten::buildATenSlow(input); at::erf_(atInput); return diopiSuccess; } diopiError_t diopiPowScalar(diopiContextHandle_t ctx, diopiTensorHandle_t out, const diopiScalar_t* input, diopiConstTensorHandle_t exponent) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atExponent = impl::aten::buildATen(exponent); at::Scalar atInput = impl::aten::buildAtScalar(input); at::Tensor atOut = impl::aten::buildATen(out); - at::pow_out(atOut, atInput, atExponent); + CALL_ATEN_CUDA_FUNC(pow_out, atOut, atInput, atExponent); return diopiSuccess; } diopiError_t diopiPow(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, const diopiScalar_t* exponent) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = 
impl::aten::buildATen(input); at::Scalar atExponent = impl::aten::buildAtScalar(exponent); at::Tensor atOut = impl::aten::buildATen(out); - at::pow_out(atOut, atInput, atExponent); + CALL_ATEN_CUDA_FUNC(pow_out, atOut, atInput, atExponent); return diopiSuccess; } diopiError_t diopiPowInp(diopiContextHandle_t ctx, diopiTensorHandle_t input, const diopiScalar_t* exponent) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Scalar atExponent = impl::aten::buildAtScalar(exponent); - atInput.pow_(atExponent); + CALL_ATEN_CUDA_FUNC(pow_, atInput, atExponent); return diopiSuccess; } diopiError_t diopiPowTensor(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t exponent) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Tensor atExponent = impl::aten::buildATen(exponent); at::Tensor atOut = impl::aten::buildATen(out); - at::pow_out(atOut, atInput, atExponent); + CALL_ATEN_CUDA_FUNC(pow_out, atOut, atInput, atExponent); return diopiSuccess; } diopiError_t diopiPowInpTensor(diopiContextHandle_t ctx, diopiTensorHandle_t input, diopiConstTensorHandle_t exponent) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Tensor atExponent = impl::aten::buildATen(exponent); - atInput.pow_(atExponent); + CALL_ATEN_CUDA_FUNC(pow_, atInput, atExponent); return diopiSuccess; } diopiError_t diopiAdd(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t other, const diopiScalar_t* alpha) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Tensor atOther = impl::aten::buildATen(other); at::Scalar atAlpha = impl::aten::buildAtScalar(alpha); at::Tensor atOut = impl::aten::buildATen(out); - at::add_out(atOut, atInput, atOther, atAlpha); + CALL_ATEN_CUDA_FUNC(add_out, atOut, atInput, atOther, atAlpha); return diopiSuccess; } diopiError_t diopiAddInp(diopiContextHandle_t ctx, diopiTensorHandle_t input, diopiConstTensorHandle_t other, const diopiScalar_t* alpha) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Tensor atOther = impl::aten::buildATen(other); at::Scalar atAlpha = impl::aten::buildAtScalar(alpha); - atInput.add_(atOther, atAlpha); + CALL_ATEN_CUDA_FUNC(add_, atInput, atOther, atAlpha); return diopiSuccess; } diopiError_t diopiAddScalar(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, const diopiScalar_t* other, const diopiScalar_t* alpha) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Scalar atOther = impl::aten::buildAtScalar(other); at::Scalar atAlpha = impl::aten::buildAtScalar(alpha); at::Tensor atOut = impl::aten::buildATen(out); - at::add_out(atOut, atInput, c10::scalar_to_tensor(atOther), atAlpha); + CALL_ATEN_CUDA_FUNC(add_out, atOut, atInput, c10::scalar_to_tensor(atOther), atAlpha); return diopiSuccess; } diopiError_t diopiAddInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t input, const diopiScalar_t* other, const diopiScalar_t* alpha) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Scalar atOther = impl::aten::buildAtScalar(other); at::Scalar atAlpha = impl::aten::buildAtScalar(alpha); - atInput.add_(atOther, atAlpha); + CALL_ATEN_CUDA_FUNC(add_, 
atInput, c10::scalar_to_tensor(atOther), atAlpha); return diopiSuccess; } diopiError_t diopiSub(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t other, const diopiScalar_t* alpha) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Tensor atOther = impl::aten::buildATen(other); at::Scalar atAlpha = impl::aten::buildAtScalar(alpha); at::Tensor atOut = impl::aten::buildATen(out); - at::sub_out(atOut, atInput, atOther, atAlpha); + CALL_ATEN_CUDA_FUNC(sub_out, atOut, atInput, atOther, atAlpha); return diopiSuccess; } diopiError_t diopiSubInp(diopiContextHandle_t ctx, diopiTensorHandle_t input, diopiConstTensorHandle_t other, const diopiScalar_t* alpha) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Tensor atOther = impl::aten::buildATen(other); at::Scalar atAlpha = impl::aten::buildAtScalar(alpha); - atInput.sub_(atOther, atAlpha); + CALL_ATEN_CUDA_FUNC(sub_, atInput, atOther, atAlpha); return diopiSuccess; } diopiError_t diopiSubScalar(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, const diopiScalar_t* other, const diopiScalar_t* alpha) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Scalar atOther = impl::aten::buildAtScalar(other); at::Scalar atAlpha = impl::aten::buildAtScalar(alpha); at::Tensor atOut = impl::aten::buildATen(out); - at::sub_out(atOut, atInput, c10::scalar_to_tensor(atOther), atAlpha); + CALL_ATEN_CUDA_FUNC(sub_out, atOut, atInput, c10::scalar_to_tensor(atOther), atAlpha); return diopiSuccess; } diopiError_t diopiSubInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t input, const diopiScalar_t* other, const diopiScalar_t* alpha) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Scalar atOther = impl::aten::buildAtScalar(other); at::Scalar atAlpha = impl::aten::buildAtScalar(alpha); - atInput.sub_(atOther, atAlpha); + CALL_ATEN_CUDA_FUNC(sub_, atInput, c10::scalar_to_tensor(atOther), atAlpha); return diopiSuccess; } diopiError_t diopiMul(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t other) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Tensor atOther = impl::aten::buildATen(other); at::Tensor atOut = impl::aten::buildATen(out); - at::mul_out(atOut, atInput, atOther); + CALL_ATEN_CUDA_FUNC(mul_out, atOut, atInput, atOther); return diopiSuccess; } diopiError_t diopiMulInp(diopiContextHandle_t ctx, diopiTensorHandle_t input, diopiConstTensorHandle_t other) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Tensor atOther = impl::aten::buildATen(other); - atInput.mul_(atOther); + CALL_ATEN_CUDA_FUNC(mul_, atInput, atOther); return diopiSuccess; } diopiError_t diopiMulScalar(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, const diopiScalar_t* other) { + DIOPI_SCOPE_GUARD 
impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Scalar atOther = impl::aten::buildAtScalar(other); at::Tensor atOut = impl::aten::buildATen(out); - at::mul_out(atOut, atInput, c10::scalar_to_tensor(atOther)); + CALL_ATEN_CUDA_FUNC(mul_out, atOut, atInput, c10::scalar_to_tensor(atOther)); return diopiSuccess; } diopiError_t diopiMulInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t input, const diopiScalar_t* other) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Scalar atOther = impl::aten::buildAtScalar(other); - atInput.mul_(atOther); + CALL_ATEN_CUDA_FUNC(mul_, atInput, c10::scalar_to_tensor(atOther)); return diopiSuccess; } diopiError_t diopiGe(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t other) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Tensor atOther = impl::aten::buildATen(other); at::Tensor atOut = impl::aten::buildATen(out); - at::ge_out(atOut, atInput, atOther); + CALL_ATEN_CUDA_FUNC(ge_out, atOut, atInput, atOther); return diopiSuccess; } diopiError_t diopiGeInp(diopiContextHandle_t ctx, diopiTensorHandle_t input, diopiConstTensorHandle_t other) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Tensor atOther = impl::aten::buildATen(other); - atInput.ge_(atOther); + CALL_ATEN_CUDA_FUNC(ge_, atInput, atOther); return diopiSuccess; } diopiError_t diopiGeScalar(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, const diopiScalar_t* other) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Scalar atOther = impl::aten::buildAtScalar(other); at::Tensor atOut = impl::aten::buildATen(out); - at::ge_out(atOut, atInput, atOther); + CALL_ATEN_CUDA_FUNC(ge_out, atOut, atInput, atOther); return diopiSuccess; } diopiError_t diopiGeInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t input, const diopiScalar_t* other) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Scalar atOther = impl::aten::buildAtScalar(other); - atInput.ge_(atOther); + CALL_ATEN_CUDA_FUNC(ge_, atInput, atOther); return diopiSuccess; } diopiError_t diopiGt(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t other) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Tensor atOther = impl::aten::buildATen(other); at::Tensor atOut = impl::aten::buildATen(out); - at::gt_out(atOut, atInput, atOther); + CALL_ATEN_CUDA_FUNC(gt_out, atOut, atInput, atOther); return diopiSuccess; } diopiError_t diopiGtInp(diopiContextHandle_t ctx, diopiTensorHandle_t input, diopiConstTensorHandle_t other) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Tensor atOther = impl::aten::buildATen(other); - atInput.gt_(atOther); + CALL_ATEN_CUDA_FUNC(gt_, atInput, atOther); return diopiSuccess; } diopiError_t diopiGtScalar(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, const diopiScalar_t* other) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Scalar atOther = impl::aten::buildAtScalar(other); at::Tensor atOut = impl::aten::buildATen(out); - at::gt_out(atOut, 
atInput, atOther); + CALL_ATEN_CUDA_FUNC(gt_out, atOut, atInput, atOther); return diopiSuccess; } diopiError_t diopiGtInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t input, const diopiScalar_t* other) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Scalar atOther = impl::aten::buildAtScalar(other); - atInput.gt_(atOther); + CALL_ATEN_CUDA_FUNC(gt_, atInput, atOther); return diopiSuccess; } diopiError_t diopiLe(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t other) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Tensor atOther = impl::aten::buildATen(other); at::Tensor atOut = impl::aten::buildATen(out); - at::le_out(atOut, atInput, atOther); + CALL_ATEN_CUDA_FUNC(le_out, atOut, atInput, atOther); return diopiSuccess; } diopiError_t diopiLeInp(diopiContextHandle_t ctx, diopiTensorHandle_t input, diopiConstTensorHandle_t other) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Tensor atOther = impl::aten::buildATen(other); - atInput.le_(atOther); + CALL_ATEN_CUDA_FUNC(le_, atInput, atOther); return diopiSuccess; } diopiError_t diopiLeScalar(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, const diopiScalar_t* other) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Scalar atOther = impl::aten::buildAtScalar(other); at::Tensor atOut = impl::aten::buildATen(out); - at::le_out(atOut, atInput, atOther); + CALL_ATEN_CUDA_FUNC(le_out, atOut, atInput, atOther); return diopiSuccess; } diopiError_t diopiLeInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t input, const diopiScalar_t* other) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Scalar atOther = impl::aten::buildAtScalar(other); - atInput.le_(atOther); + CALL_ATEN_CUDA_FUNC(le_, atInput, atOther); return diopiSuccess; } diopiError_t diopiLt(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t other) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Tensor atOther = impl::aten::buildATen(other); at::Tensor atOut = impl::aten::buildATen(out); - at::lt_out(atOut, atInput, atOther); + CALL_ATEN_CUDA_FUNC(lt_out, atOut, atInput, atOther); return diopiSuccess; } diopiError_t diopiLtInp(diopiContextHandle_t ctx, diopiTensorHandle_t input, diopiConstTensorHandle_t other) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Tensor atOther = impl::aten::buildATen(other); - atInput.lt_(atOther); + CALL_ATEN_CUDA_FUNC(lt_, atInput, atOther); return diopiSuccess; } diopiError_t diopiLtScalar(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, const diopiScalar_t* other) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Scalar atOther = impl::aten::buildAtScalar(other); at::Tensor atOut = impl::aten::buildATen(out); - at::lt_out(atOut, atInput, atOther); + CALL_ATEN_CUDA_FUNC(lt_out, atOut, atInput, atOther); return diopiSuccess; } diopiError_t diopiLtInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t input, const diopiScalar_t* other) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor 
atInput = impl::aten::buildATen(input); at::Scalar atOther = impl::aten::buildAtScalar(other); - atInput.lt_(atOther); + CALL_ATEN_CUDA_FUNC(lt_, atInput, atOther); return diopiSuccess; } diopiError_t diopiEq(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t other) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Tensor atOther = impl::aten::buildATen(other); at::Tensor atOut = impl::aten::buildATen(out); - at::eq_out(atOut, atInput, atOther); + CALL_ATEN_CUDA_FUNC(eq_out, atOut, atInput, atOther); return diopiSuccess; } diopiError_t diopiEqInp(diopiContextHandle_t ctx, diopiTensorHandle_t input, diopiConstTensorHandle_t other) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Tensor atOther = impl::aten::buildATen(other); - atInput.eq_(atOther); + CALL_ATEN_CUDA_FUNC(eq_, atInput, atOther); return diopiSuccess; } diopiError_t diopiEqScalar(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, const diopiScalar_t* other) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Scalar atOther = impl::aten::buildAtScalar(other); at::Tensor atOut = impl::aten::buildATen(out); - at::eq_out(atOut, atInput, atOther); + CALL_ATEN_CUDA_FUNC(eq_out, atOut, atInput, atOther); return diopiSuccess; } diopiError_t diopiEqInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t input, const diopiScalar_t* other) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Scalar atOther = impl::aten::buildAtScalar(other); - atInput.eq_(atOther); + CALL_ATEN_CUDA_FUNC(eq_, atInput, atOther); return diopiSuccess; } diopiError_t diopiNe(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t other) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Tensor atOther = impl::aten::buildATen(other); at::Tensor atOut = impl::aten::buildATen(out); - at::ne_out(atOut, atInput, atOther); + CALL_ATEN_CUDA_FUNC(ne_out, atOut, atInput, atOther); return diopiSuccess; } diopiError_t diopiNeInp(diopiContextHandle_t ctx, diopiTensorHandle_t input, diopiConstTensorHandle_t other) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Tensor atOther = impl::aten::buildATen(other); - atInput.ne_(atOther); + CALL_ATEN_CUDA_FUNC(ne_, atInput, atOther); return diopiSuccess; } diopiError_t diopiNeScalar(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, const diopiScalar_t* other) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Scalar atOther = impl::aten::buildAtScalar(other); at::Tensor atOut = impl::aten::buildATen(out); - at::ne_out(atOut, atInput, atOther); + CALL_ATEN_CUDA_FUNC(ne_out, atOut, atInput, atOther); return diopiSuccess; } diopiError_t diopiNeInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t input, const diopiScalar_t* other) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Scalar atOther = impl::aten::buildAtScalar(other); - atInput.ne_(atOther); + CALL_ATEN_CUDA_FUNC(ne_, atInput, atOther); return diopiSuccess; } diopiError_t diopiBitwiseAnd(diopiContextHandle_t ctx, diopiTensorHandle_t out, 
diopiConstTensorHandle_t input, diopiConstTensorHandle_t other) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Tensor atOther = impl::aten::buildATen(other); at::Tensor atOut = impl::aten::buildATen(out); - at::bitwise_and_out(atOut, atInput, atOther); + CALL_ATEN_CUDA_FUNC(bitwise_and_out, atOut, atInput, atOther); return diopiSuccess; } diopiError_t diopiBitwiseAndInp(diopiContextHandle_t ctx, diopiTensorHandle_t input, diopiConstTensorHandle_t other) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Tensor atOther = impl::aten::buildATen(other); - atInput.bitwise_and_(atOther); + CALL_ATEN_CUDA_FUNC(bitwise_and_, atInput, atOther); return diopiSuccess; } diopiError_t diopiBitwiseAndScalar(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, const diopiScalar_t* other) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Scalar atOther = impl::aten::buildAtScalar(other); at::Tensor atOut = impl::aten::buildATen(out); - at::bitwise_and_out(atOut, atInput, atOther); + CALL_ATEN_CUDA_FUNC(bitwise_and_out, atOut, atInput, c10::scalar_to_tensor(atOther)); return diopiSuccess; } diopiError_t diopiBitwiseAndInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t input, const diopiScalar_t* other) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Scalar atOther = impl::aten::buildAtScalar(other); - atInput.bitwise_and_(atOther); + CALL_ATEN_CUDA_FUNC(bitwise_and_, atInput, c10::scalar_to_tensor(atOther)); return diopiSuccess; } diopiError_t diopiBitwiseOr(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t other) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Tensor atOther = impl::aten::buildATen(other); at::Tensor atOut = impl::aten::buildATen(out); - at::bitwise_or_out(atOut, atInput, atOther); + CALL_ATEN_CUDA_FUNC(bitwise_or_out, atOut, atInput, atOther); return diopiSuccess; } diopiError_t diopiBitwiseOrInp(diopiContextHandle_t ctx, diopiTensorHandle_t input, diopiConstTensorHandle_t other) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Tensor atOther = impl::aten::buildATen(other); - atInput.bitwise_or_(atOther); + CALL_ATEN_CUDA_FUNC(bitwise_or_, atInput, atOther); return diopiSuccess; } diopiError_t diopiBitwiseOrScalar(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, const diopiScalar_t* other) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Scalar atOther = impl::aten::buildAtScalar(other); at::Tensor atOut = impl::aten::buildATen(out); - at::bitwise_or_out(atOut, atInput, atOther); + CALL_ATEN_CUDA_FUNC(bitwise_or_out, atOut, atInput, c10::scalar_to_tensor(atOther)); return diopiSuccess; } diopiError_t diopiBitwiseOrInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t input, const diopiScalar_t* other) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Scalar atOther = impl::aten::buildAtScalar(other); - atInput.bitwise_or_(atOther); + CALL_ATEN_CUDA_FUNC(bitwise_or_, atInput, c10::scalar_to_tensor(atOther)); return diopiSuccess; } diopiError_t diopiLogicalAnd(diopiContextHandle_t ctx, 
diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t other) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Tensor atOther = impl::aten::buildATen(other); at::Tensor atOut = impl::aten::buildATen(out); - at::logical_and_out(atOut, atInput, atOther); + CALL_ATEN_CUDA_FUNC(logical_and_out, atOut, atInput, atOther); return diopiSuccess; } diopiError_t diopiLogicalAndInp(diopiContextHandle_t ctx, diopiTensorHandle_t input, diopiConstTensorHandle_t other) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Tensor atOther = impl::aten::buildATen(other); - atInput.logical_and_(atOther); + CALL_ATEN_CUDA_FUNC(logical_and_out, atInput, atInput, atOther); return diopiSuccess; } diopiError_t diopiLogicalOr(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t other) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Tensor atOther = impl::aten::buildATen(other); at::Tensor atOut = impl::aten::buildATen(out); - at::logical_or_out(atOut, atInput, atOther); + CALL_ATEN_CUDA_FUNC(logical_or_out, atOut, atInput, atOther); return diopiSuccess; } diopiError_t diopiLogicalOrInp(diopiContextHandle_t ctx, diopiTensorHandle_t input, diopiConstTensorHandle_t other) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Tensor atOther = impl::aten::buildATen(other); - atInput.logical_or_(atOther); + CALL_ATEN_CUDA_FUNC(logical_or_out, atInput, atInput, atOther); return diopiSuccess; } diopiError_t diopiClampInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t input, const diopiScalar_t* min, const diopiScalar_t* max) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + at::Tensor atInput = impl::aten::buildATenSlow(input); c10::optional atMin = c10::optional(); if (min != nullptr) { atMin = impl::aten::buildAtScalar(min); @@ -1587,6 +1716,7 @@ diopiError_t diopiClampInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t i diopiError_t diopiClampScalar(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, const diopiScalar_t* min, const diopiScalar_t* max) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); c10::optional atMin = c10::optional(); @@ -1598,14 +1728,14 @@ diopiError_t diopiClampScalar(diopiContextHandle_t ctx, diopiTensorHandle_t out, atMax = impl::aten::buildAtScalar(max); } at::Tensor atOut = impl::aten::buildATen(out); - at::clamp_out(atOut, atInput, atMin, atMax); + CALL_ATEN_CUDA_FUNC(clamp_out, atOut, atInput, atMin, atMax); return diopiSuccess; } diopiError_t diopiClampMaxInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t input, const diopiScalar_t* max) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + at::Tensor atInput = impl::aten::buildATenSlow(input); at::Scalar atMax = impl::aten::buildAtScalar(max); at::clamp_max_(atInput, atMax); @@ -1613,11 +1743,12 @@ diopiError_t diopiClampMaxInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_ } diopiError_t diopiClampMaxScalar(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, const diopiScalar_t* max) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Scalar atMax = 
impl::aten::buildAtScalar(max); at::Tensor atOut = impl::aten::buildATen(out); - at::clamp_max_out(atOut, atInput, atMax); + CALL_ATEN_CUDA_FUNC(clamp_max_out, atOut, atInput, atMax); return diopiSuccess; } @@ -1625,14 +1756,14 @@ diopiError_t diopiClampMaxScalar(diopiContextHandle_t ctx, diopiTensorHandle_t o #if TORCH_MM_VERSION > TORCH_1_9_MM_VERSION diopiError_t diopiClampInp(diopiContextHandle_t ctx, diopiTensorHandle_t input, diopiConstTensorHandle_t min, diopiConstTensorHandle_t max) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + at::Tensor atInput = impl::aten::buildATenSlow(input); c10::optional atMin = c10::optional(); if (min != nullptr) { - atMin = impl::aten::buildATen(min); + atMin = impl::aten::buildATenSlow(min); } c10::optional atMax = c10::optional(); if (max != nullptr) { - atMax = impl::aten::buildATen(max); + atMax = impl::aten::buildATenSlow(max); } at::clamp_(atInput, atMin, atMax); @@ -1641,6 +1772,7 @@ diopiError_t diopiClampInp(diopiContextHandle_t ctx, diopiTensorHandle_t input, diopiError_t diopiClamp(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t min, diopiConstTensorHandle_t max) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); c10::optional atMin = c10::optional(); @@ -1652,45 +1784,47 @@ diopiError_t diopiClamp(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopi atMax = impl::aten::buildATen(max); } at::Tensor atOut = impl::aten::buildATen(out); - at::clamp_out(atOut, atInput, atMin, atMax); + CALL_ATEN_CUDA_FUNC(clamp_out, atOut, atInput, atMin, atMax); return diopiSuccess; } diopiError_t diopiClampMaxInp(diopiContextHandle_t ctx, diopiTensorHandle_t input, diopiConstTensorHandle_t max) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atMax = impl::aten::buildATen(max); + at::Tensor atInput = impl::aten::buildATenSlow(input); + at::Tensor atMax = impl::aten::buildATenSlow(max); at::clamp_max_(atInput, atMax); return diopiSuccess; } diopiError_t diopiClampMax(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t max) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Tensor atMax = impl::aten::buildATen(max); at::Tensor atOut = impl::aten::buildATen(out); - at::clamp_max_out(atOut, atInput, atMax); + CALL_ATEN_CUDA_FUNC(clamp_max_out, atOut, atInput, atMax); return diopiSuccess; } diopiError_t diopiClampMinInp(diopiContextHandle_t ctx, diopiTensorHandle_t input, diopiConstTensorHandle_t min) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atMin = impl::aten::buildATen(min); + at::Tensor atInput = impl::aten::buildATenSlow(input); + at::Tensor atMin = impl::aten::buildATenSlow(min); at::clamp_(atInput, atMin); return diopiSuccess; } diopiError_t diopiClampMin(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t min) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Tensor atMin = impl::aten::buildATen(min); at::Tensor atOut = impl::aten::buildATen(out); - at::clamp_out(atOut, atInput, atMin); + CALL_ATEN_CUDA_FUNC(clamp_out, atOut, atInput, atMin); return diopiSuccess; } @@ -1698,7 +1832,7 @@ diopiError_t diopiClampMin(diopiContextHandle_t ctx, diopiTensorHandle_t out, di 
diopiError_t diopiClampMinInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t input, const diopiScalar_t* min) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + at::Tensor atInput = impl::aten::buildATenSlow(input); at::Scalar atMin = impl::aten::buildAtScalar(min); at::clamp_(atInput, atMin); @@ -1706,18 +1840,19 @@ diopiError_t diopiClampMinInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_ } diopiError_t diopiClampMinScalar(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, const diopiScalar_t* min) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Scalar atMin = impl::aten::buildAtScalar(min); at::Tensor atOut = impl::aten::buildATen(out); - at::clamp_out(atOut, atInput, atMin); + CALL_ATEN_CUDA_FUNC(clamp_out, atOut, atInput, atMin); return diopiSuccess; } diopiError_t diopiFill(diopiContextHandle_t ctx, diopiTensorHandle_t input, const diopiScalar_t* value) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + at::Tensor atInput = impl::aten::buildATenSlow(input); at::Scalar atValue = impl::aten::buildAtScalar(value); at::fill_(atInput, atValue); @@ -1725,18 +1860,19 @@ diopiError_t diopiFill(diopiContextHandle_t ctx, diopiTensorHandle_t input, cons } diopiError_t diopiAdaptiveAvgPool2d(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiSize_t output_size) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); auto atOutSize = impl::aten::buildAtIntArray(output_size); at::Tensor atOut = impl::aten::buildATen(out); - at::adaptive_avg_pool2d_out(atOut, atInput, atOutSize); + CALL_ATEN_CUDA_FUNC(adaptive_avg_pool2d_out, atOut, atInput, atOutSize); return diopiSuccess; } diopiError_t diopiAdaptiveMaxPool2d(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiSize_t output_size) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + at::Tensor atInput = impl::aten::buildATenSlow(input); auto atOutSize = impl::aten::buildAtIntArray(output_size); auto atOuts = at::adaptive_max_pool2d(atInput, atOutSize); impl::aten::updateATen2Tensor(ctx, std::get<0>(atOuts), out); @@ -1746,30 +1882,33 @@ diopiError_t diopiAdaptiveMaxPool2d(diopiContextHandle_t ctx, diopiTensorHandle_ diopiError_t diopiAdaptiveMaxPool2dWithIndices(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiTensorHandle_t indices, diopiConstTensorHandle_t input, diopiSize_t output_size) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); auto atOutSize = impl::aten::buildAtIntArray(output_size); at::Tensor atOut = impl::aten::buildATen(out); at::Tensor atIndices = impl::aten::buildATen(indices); - at::adaptive_max_pool2d_out(atOut, atIndices, atInput, atOutSize); + CALL_ATEN_CUDA_FUNC(adaptive_max_pool2d_out, atOut, atIndices, atInput, atOutSize); return diopiSuccess; } diopiError_t diopiAdaptiveMaxPool2dBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiConstTensorHandle_t grad_output, diopiConstTensorHandle_t input, diopiConstTensorHandle_t indices) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Tensor atGradOutput = impl::aten::buildATen(grad_output); at::Tensor atIndices = impl::aten::buildATen(indices); at::Tensor atGradInput = impl::aten::buildATen(grad_input); - 
at::adaptive_max_pool2d_backward_out(atGradInput, atGradOutput, atInput, atIndices); + CALL_ATEN_CUDA_FUNC(adaptive_max_pool2d_backward_out, atGradInput, atGradOutput, atInput, atIndices); return diopiSuccess; } diopiError_t diopiAvgPool2d(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiSize_t kernel_size, diopiSize_t stride, diopiSize_t padding, bool ceil_mode, bool count_include_pad, const int64_t* divisor_override) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::IntArrayRef atKernelSize = impl::aten::buildAtIntArray(kernel_size); @@ -1777,7 +1916,7 @@ diopiError_t diopiAvgPool2d(diopiContextHandle_t ctx, diopiTensorHandle_t out, d at::IntArrayRef atPadding = impl::aten::buildAtIntArray(padding); c10::optional atDivisorOverride = divisor_override ? c10::optional(*divisor_override) : c10::nullopt; at::Tensor atOut = impl::aten::buildATen(out); - at::avg_pool2d_out(atOut, atInput, atKernelSize, atStride, atPadding, ceil_mode, count_include_pad, atDivisorOverride); + CALL_ATEN_CUDA_FUNC(avg_pool2d_out, atOut, atInput, atKernelSize, atStride, atPadding, ceil_mode, count_include_pad, atDivisorOverride); return diopiSuccess; } @@ -1785,16 +1924,17 @@ diopiError_t diopiAvgPool2d(diopiContextHandle_t ctx, diopiTensorHandle_t out, d diopiError_t diopiDropout(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiTensorHandle_t mask, diopiConstTensorHandle_t input, double p, bool train, diopiGeneratorHandle_t generator) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + at::Tensor atInput = impl::aten::buildATenSlow(input); if (train) { at::Generator gen = impl::aten::buildGenerator(ctx, generator); - at::Tensor atOut = impl::aten::buildATen(out); - at::Tensor atMask = impl::aten::buildATen(mask); + at::Tensor atOut = impl::aten::buildATenSlow(out); + at::Tensor atMask = impl::aten::buildATenSlow(mask); if (atInput.numel() == atMask.numel()) { - at::_fused_dropout_out(atOut, atMask, atInput, 1 - p, gen); + // not supported by CUDA dispatch yet; will be supported in a subsequent release. + CALL_ATEN_FUNC(_fused_dropout_out, atOut, atMask, atInput, 1 - p, gen); } else { atMask.bernoulli_(1 - p, gen); - at::mul_out(atOut, atInput, atMask); + CALL_ATEN_CUDA_FUNC(mul_out, atOut, atInput, atMask); atOut.div_(1 - p); } impl::aten::updateGeneratorHandleState(ctx, gen, generator); @@ -1810,10 +1950,11 @@ diopiError_t diopiDropoutInp(diopiContextHandle_t ctx, diopiTensorHandle_t input impl::aten::setCurStream(ctx); if (train) { at::Generator gen = impl::aten::buildGenerator(ctx, generator); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atMask = impl::aten::buildATen(mask); + at::Tensor atInput = impl::aten::buildATenSlow(input); + at::Tensor atMask = impl::aten::buildATenSlow(mask); if (atInput.numel() == atMask.numel()) { - at::_fused_dropout_out(atInput, atMask, atInput, 1 - p, gen); + // not supported by CUDA dispatch yet; will be supported in a subsequent release. 
+ CALL_ATEN_FUNC(_fused_dropout_out, atInput, atMask, atInput, 1 - p, gen); } else { atMask.bernoulli_(1 - p, gen); atInput.mul_(atMask).div_(1 - p); @@ -1827,14 +1968,14 @@ diopiError_t diopiDropoutInp(diopiContextHandle_t ctx, diopiTensorHandle_t input diopiError_t diopiMSELoss(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t target, diopiReduction_t reduction) { impl::aten::setCurStream(ctx); - at::Tensor atOut = impl::aten::buildATen(out); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atTarget = impl::aten::buildATen(target); + at::Tensor atOut = impl::aten::buildATenSlow(out); + at::Tensor atInput = impl::aten::buildATenSlow(input); + at::Tensor atTarget = impl::aten::buildATenSlow(target); // Note(huqingqing): at::mse_loss_out reduce in the 0 dimension, which is different from at::mse_loss. // at::mse_loss reduce over all the dimensions. if (reduction == 0) { - at::Tensor atOut = impl::aten::buildATen(out); - at::mse_loss_out(atOut, atInput, atTarget, reduction); + at::Tensor atOut = impl::aten::buildATenSlow(out); + CALL_ATEN_CUDA_FUNC(mse_loss_out, atOut, atInput, atTarget, reduction); } else { impl::aten::invokeATenFuncRet(ctx, at::mse_loss, out, atInput, atTarget, reduction); } @@ -1845,19 +1986,19 @@ diopiError_t diopiMSELoss(diopiContextHandle_t ctx, diopiTensorHandle_t out, dio diopiError_t diopiSigmoidFocalLoss(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t inputs, diopiConstTensorHandle_t targets, float alpha, float gamma, diopiReduction_t reduction) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(inputs); - at::Tensor atTarget = impl::aten::buildATen(targets); + at::Tensor atInput = impl::aten::buildATenSlow(inputs); + at::Tensor atTarget = impl::aten::buildATenSlow(targets); at::Tensor atP = at::sigmoid(atInput); at::Tensor atTerm1 = at::pow(1 - atP, gamma) * at::log(atP); at::Tensor atTerm2 = at::pow(atP, gamma) * at::log(1 - atP); at::Tensor atRes = -atTarget * atTerm1 * alpha - (1 - atTarget) * atTerm2 * (1 - alpha); - at::Tensor atOut = impl::aten::buildATen(out); + at::Tensor atOut = impl::aten::buildATenSlow(out); if (reduction == 0) { impl::aten::updateATen2Tensor(ctx, atRes, out); } else if (reduction == 1) { - at::mean_out(atOut, atRes, impl::aten::getSequence(atRes.dim())); + CALL_ATEN_CUDA_FUNC(mean_out, atOut, atRes, impl::aten::getSequence(atRes.dim())); } else if (reduction == 2) { - at::sum_out(atOut, atRes, impl::aten::getSequence(atRes.dim())); + CALL_ATEN_CUDA_FUNC(sum_out, atOut, atRes, impl::aten::getSequence(atRes.dim())); } else { NOT_SUPPORTED("sigmoid reduction type"); return diopiErrorOccurred; @@ -1869,6 +2010,7 @@ diopiError_t diopiSigmoidFocalLoss(diopiContextHandle_t ctx, diopiTensorHandle_t diopiError_t diopiBatchNorm(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiTensorHandle_t save_mean, diopiTensorHandle_t save_invstd, diopiConstTensorHandle_t input, diopiConstTensorHandle_t weight, diopiConstTensorHandle_t bias, diopiTensorHandle_t running_mean, diopiTensorHandle_t running_var, bool training, double momentum, double eps) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Tensor atWeight = impl::aten::buildATen(weight); @@ -1878,7 +2020,15 @@ diopiError_t diopiBatchNorm(diopiContextHandle_t ctx, diopiTensorHandle_t out, d at::Tensor atOut = impl::aten::buildATen(out); at::Tensor atSaveMean = impl::aten::buildATen(save_mean); at::Tensor 
atSaveInvstd = impl::aten::buildATen(save_invstd); - at::native_batch_norm_out(atOut, atSaveMean, atSaveInvstd, atInput, atWeight, atBias, atRunningMean, atRunningVar, training, momentum, eps); + CALL_ATEN_CUDA_FUNC(native_batch_norm_out, atOut, atSaveMean, atSaveInvstd, atInput, atWeight, atBias, atRunningMean, atRunningVar, training, momentum, eps); return diopiSuccess; } @@ -1886,7 +2036,7 @@ diopiError_t diopiBatchNorm(diopiContextHandle_t ctx, diopiTensorHandle_t out, d diopiError_t diopiSlice(diopiContextHandle_t ctx, diopiTensorHandle_t null_out, diopiConstTensorHandle_t input, int64_t dim, int64_t start, int64_t end, int64_t step) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + at::Tensor atInput = impl::aten::buildATenSlow(input); at::Tensor atOut = at::slice(atInput, dim, start, end, step).contiguous(); impl::aten::updateATen2Tensor(ctx, atOut, null_out); @@ -1896,14 +2046,14 @@ diopiError_t diopiSlice(diopiContextHandle_t ctx, diopiTensorHandle_t null_out, diopiError_t diopiIndex(diopiContextHandle_t ctx, diopiTensorHandle_t* out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t* indices, int64_t nums) { impl::aten::setCurStream(ctx); DIOPI_CHECK(out != nullptr && indices != nullptr, "Not supported: out or indices is nullptr"); - at::Tensor atInput = impl::aten::buildATen(input); + at::Tensor atInput = impl::aten::buildATenSlow(input); c10::List> vecIdx; vecIdx.reserve(nums); for (size_t i = 0; i < nums; ++i) { if (indices[i] == nullptr) { vecIdx.emplace_back(c10::nullopt); } else { - vecIdx.emplace_back(impl::aten::buildATen(indices[i])); + vecIdx.emplace_back(impl::aten::buildATenSlow(indices[i])); } } at::Tensor atOut = at::index(atInput, vecIdx).contiguous(); @@ -1914,6 +2064,7 @@ diopiError_t diopiIndex(diopiContextHandle_t ctx, diopiTensorHandle_t* out, diop diopiError_t diopiBCEWithLogits(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t target, diopiConstTensorHandle_t weight, diopiConstTensorHandle_t pos_weight, diopiReduction_t reduction) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atOut = impl::aten::buildATen(out); at::Tensor atInput = impl::aten::buildATen(input); @@ -1921,26 +2072,28 @@ diopiError_t diopiBCEWithLogits(diopiContextHandle_t ctx, diopiTensorHandle_t ou c10::optional atWeight = weight ? c10::optional(impl::aten::buildATen(weight)) : c10::nullopt; c10::optional atPosWeight = pos_weight ? c10::optional(impl::aten::buildATen(pos_weight)) : c10::nullopt; - at::binary_cross_entropy_with_logits_out(atOut, atInput, atTarget, atWeight, atPosWeight, reduction); + // not supported by CUDA dispatch yet; will be supported in a subsequent release. 
+ CALL_ATEN_FUNC(binary_cross_entropy_with_logits_out, atOut, atInput, atTarget, atWeight, atPosWeight, reduction); return diopiSuccess; } diopiError_t diopiHardtanh(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, const diopiScalar_t* min_val, const diopiScalar_t* max_val) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); auto atMin = impl::aten::buildAtScalar(min_val); auto atMax = impl::aten::buildAtScalar(max_val); auto atOut = impl::aten::buildATen(out); - at::hardtanh_out(atOut, atInput, atMin, atMax); + CALL_ATEN_CUDA_FUNC(hardtanh_out, atOut, atInput, atMin, atMax); return diopiSuccess; } diopiError_t diopiHardtanhInp(diopiContextHandle_t ctx, diopiTensorHandle_t input, const diopiScalar_t* min_val, const diopiScalar_t* max_val) { impl::aten::setCurStream(ctx); - auto atInput = impl::aten::buildATen(input); + auto atInput = impl::aten::buildATenSlow(input); auto atMin = impl::aten::buildAtScalar(min_val); auto atMax = impl::aten::buildAtScalar(max_val); at::hardtanh_(atInput, atMin, atMax); @@ -1949,46 +2102,50 @@ diopiError_t diopiHardtanhInp(diopiContextHandle_t ctx, diopiTensorHandle_t inpu } diopiError_t diopiHardswish(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Tensor atOut = impl::aten::buildATen(out); - at::hardswish_out(atOut, atInput); + CALL_ATEN_CUDA_FUNC(hardswish_out, atOut, atInput); return diopiSuccess; } diopiError_t diopiHardswishInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + at::Tensor atInput = impl::aten::buildATenSlow(input); at::hardswish_(atInput); return diopiSuccess; } diopiError_t diopiHardswishBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiConstTensorHandle_t grad_output, diopiConstTensorHandle_t input) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atGradInput = impl::aten::buildATen(grad_input); auto atGradOutput = impl::aten::buildATen(grad_output); auto atInput = impl::aten::buildATen(input); - at::hardswish_backward_out(atGradInput, atGradOutput, atInput); + // not supported by CUDA dispatch yet; will be supported in a subsequent release. 
+ CALL_ATEN_FUNC(hardswish_backward_out, atGradInput, atGradOutput, atInput); return diopiSuccess; } diopiError_t diopiThreshold(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, const diopiScalar_t* threshold, const diopiScalar_t* value) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); auto atThreshold = impl::aten::buildAtScalar(threshold); auto atValue = impl::aten::buildAtScalar(value); auto atOut = impl::aten::buildATen(out); - at::threshold_out(atOut, atInput, atThreshold, atValue); + CALL_ATEN_CUDA_FUNC(threshold_out, atOut, atInput, atThreshold, atValue); return diopiSuccess; } diopiError_t diopiThresholdInp(diopiContextHandle_t ctx, diopiTensorHandle_t input, const diopiScalar_t* threshold, const diopiScalar_t* value) { impl::aten::setCurStream(ctx); - auto atInput = impl::aten::buildATen(input); + auto atInput = impl::aten::buildATenSlow(input); auto atThreshold = impl::aten::buildAtScalar(threshold); auto atValue = impl::aten::buildAtScalar(value); at::threshold_(atInput, atThreshold, atValue); @@ -1997,17 +2154,19 @@ diopiError_t diopiThresholdInp(diopiContextHandle_t ctx, diopiTensorHandle_t inp } diopiError_t diopiGelu(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, const char* approximate) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atOut = impl::aten::buildATen(out); auto atInput = impl::aten::buildATen(input); c10::string_view atApproximate(approximate, strlen(approximate)); - at::gelu_out(atOut, atInput, atApproximate); + CALL_ATEN_CUDA_FUNC(gelu_out, atOut, atInput, atApproximate); return diopiSuccess; } diopiError_t diopiNLLLoss(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t target, diopiConstTensorHandle_t weight, diopiReduction_t reduction, int64_t ignore_index) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); auto atOut = impl::aten::buildATen(out); @@ -2032,9 +2191,11 @@ diopiError_t diopiNLLLoss(diopiContextHandle_t ctx, diopiTensorHandle_t out, dio } if (dim >= 3) { - at::nll_loss2d_out(atOut, atInput, atTarget, atWeight, reduction, ignore_index); + // not supported by CUDA dispatch yet; will be supported in a subsequent release. + CALL_ATEN_FUNC(nll_loss2d_out, atOut, atInput, atTarget, atWeight, reduction, ignore_index); } else { - at::nll_loss_out(atOut, atInput, atTarget, atWeight, reduction, ignore_index); + // not supported by CUDA dispatch yet; will be supported in a subsequent release. + CALL_ATEN_FUNC(nll_loss_out, atOut, atInput, atTarget, atWeight, reduction, ignore_index); } return diopiSuccess; @@ -2042,17 +2203,20 @@ diopiError_t diopiNLLLoss(diopiContextHandle_t ctx, diopiTensorHandle_t out, dio diopiError_t diopiSliceBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiConstTensorHandle_t grad_output, diopiSize_t input_sizes, int64_t dim, int64_t start, int64_t end, int64_t step) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::IntArrayRef atInputSizes = impl::aten::buildAtIntArray(input_sizes); at::Tensor atGradOutput = impl::aten::buildATen(grad_output); at::Tensor atGradInput = impl::aten::buildATen(grad_input); - at::slice_backward_out(atGradInput, atGradOutput, atInputSizes, dim, start, end, step); + // not supported by CUDA dispatch yet; will be supported in a subsequent release. 
+ CALL_ATEN_FUNC(slice_backward_out, atGradInput, atGradOutput, atInputSizes, dim, start, end, step); return diopiSuccess; } diopiError_t diopiIndexBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiTensorHandle_t zeros_like_input, diopiConstTensorHandle_t* indices, int64_t nums, diopiConstTensorHandle_t grad) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); DIOPI_CHECK_PTR(indices); at::Tensor atZerosInput = impl::aten::buildATen(zeros_like_input); @@ -2068,7 +2232,8 @@ diopiError_t diopiIndexBackward(diopiContextHandle_t ctx, diopiTensorHandle_t gr vecIdx.emplace_back(atIndex); } } - at::_index_put_impl_out(atGradInput, atZerosInput, vecIdx, atGrad, true, true); + // not supported cuda dispatch yet, will supported in subsequent release. + CALL_ATEN_FUNC(_index_put_impl_out, atGradInput, atZerosInput, vecIdx, atGrad, true, true); return diopiSuccess; } @@ -2077,9 +2242,9 @@ diopiError_t diopiSigmoidFocalLossBackward(diopiContextHandle_t ctx, diopiTensor diopiConstTensorHandle_t target, diopiTensorHandle_t grad_input, float gamma, float alpha, diopiReduction_t reduction) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atTarget = impl::aten::buildATen(target); - at::Tensor atGrad = impl::aten::buildATen(grad_output); + at::Tensor atInput = impl::aten::buildATenSlow(input); + at::Tensor atTarget = impl::aten::buildATenSlow(target); + at::Tensor atGrad = impl::aten::buildATenSlow(grad_output); at::Tensor atGradOutput = at::empty_like(atInput); if (reduction == 1) { atGradOutput.copy_(atGrad.expand_as(atInput) / atInput.numel()); @@ -2108,8 +2273,8 @@ diopiError_t diopiRoiAlignBackward(diopiContextHandle_t ctx, diopiTensorHandle_t double spatialScale, int64_t pooledHeight, int64_t pooledWidth, int64_t batchSize, int64_t channels, int64_t height, int64_t width, int64_t samplingRatio, bool aligned) { impl::aten::setCurStream(ctx); - auto atGrad = impl::aten::buildATen(grad); - auto atRois = impl::aten::buildATen(rois); + auto atGrad = impl::aten::buildATenSlow(grad); + auto atRois = impl::aten::buildATenSlow(rois); auto atOut = vision::ops::roi_align_backward_kernel( atGrad, atRois, spatialScale, pooledHeight, pooledWidth, batchSize, channels, height, width, samplingRatio, aligned); impl::aten::updateATen2Tensor(ctx, atOut, out); @@ -2122,9 +2287,9 @@ diopiError_t diopiConvolution2dBackward(diopiContextHandle_t ctx, diopiTensorHan diopiConstTensorHandle_t weight, diopiSize_t* bias_sizes, diopiSize_t stride, diopiSize_t padding, diopiSize_t dilation, int64_t groups) { impl::aten::setCurStream(ctx); - auto atInput = impl::aten::buildATen(input); - auto atGrad = impl::aten::buildATen(grad_output); - auto atWeight = impl::aten::buildATen(weight); + auto atInput = impl::aten::buildATenSlow(input); + auto atGrad = impl::aten::buildATenSlow(grad_output); + auto atWeight = impl::aten::buildATenSlow(weight); auto atStride = impl::aten::buildAtIntArray(stride); auto atPadding = impl::aten::buildAtIntArray(padding); auto atDilation = impl::aten::buildAtIntArray(dilation); @@ -2134,14 +2299,14 @@ diopiError_t diopiConvolution2dBackward(diopiContextHandle_t ctx, diopiTensorHan impl::aten::invokeATenFuncRet( ctx, at::miopen_convolution_backward, vecOut, atInput, atGrad, atWeight, atPadding, atStride, atDilation, groups, false, false, grad_input_mask); if (bias_sizes && grad_bias) { - auto atGradBias = impl::aten::buildATen(grad_bias); + auto atGradBias = impl::aten::buildATenSlow(grad_bias); at::Tensor atTmp = atGrad; int64_t size = 
atGrad.dim() - 1; while (atGradBias.dim() != size) { atTmp = at::sum(atTmp, -1, false); size -= 1; } - at::sum_out(atGradBias, atTmp, 0, false); + CALL_ATEN_CUDA_FUNC(sum_out, atGradBias, atTmp, 0, false); } #else std::vector outputPadding(padding.len, 0); @@ -2149,9 +2314,9 @@ diopiError_t diopiConvolution2dBackward(diopiContextHandle_t ctx, diopiTensorHan // TODO(ywt): when pytorch fix the bug of empty tensor, remove the // check of grad_input && grad_weight auto atBiasSizes = impl::aten::buildAtIntArray(bias_sizes); - auto atGradInput = impl::aten::buildATen(grad_input); - auto atGradWeight = impl::aten::buildATen(grad_weight); - auto atGradBias = impl::aten::buildATen(grad_bias); + auto atGradInput = impl::aten::buildATenSlow(grad_input); + auto atGradWeight = impl::aten::buildATenSlow(grad_weight); + auto atGradBias = impl::aten::buildATenSlow(grad_bias); at::convolution_backward_out(atGradInput, atGradWeight, atGradBias, @@ -2172,14 +2337,14 @@ diopiError_t diopiConvolution2dBackward(diopiContextHandle_t ctx, diopiTensorHan impl::aten::updateATen2Tensor(ctx, std::get<0>(results), grad_input); impl::aten::updateATen2Tensor(ctx, std::get<1>(results), grad_weight); if (bias_sizes && grad_bias) { - auto atGradBias = impl::aten::buildATen(grad_bias); + auto atGradBias = impl::aten::buildATenSlow(grad_bias); at::Tensor atTmp = atGrad; int64_t size = atGrad.dim() - 1; while (atGradBias.dim() != size) { atTmp = at::sum(atTmp, -1, false); size -= 1; } - at::sum_out(atGradBias, atTmp, 0, false); + CALL_ATEN_CUDA_FUNC(sum_out, atGradBias, atTmp, 0, false); } } #endif @@ -2192,9 +2357,9 @@ diopiError_t diopiConvTranspose2dBackward(diopiContextHandle_t ctx, diopiTensorH diopiConstTensorHandle_t weight, diopiSize_t* bias_sizes, diopiSize_t stride, diopiSize_t padding, diopiSize_t dilation, diopiSize_t output_padding, int64_t groups) { impl::aten::setCurStream(ctx); - auto atInput = impl::aten::buildATen(input); - auto atGrad = impl::aten::buildATen(grad_output); - auto atWeight = impl::aten::buildATen(weight); + auto atInput = impl::aten::buildATenSlow(input); + auto atGrad = impl::aten::buildATenSlow(grad_output); + auto atWeight = impl::aten::buildATenSlow(weight); auto atStride = impl::aten::buildAtIntArray(stride); auto atPadding = impl::aten::buildAtIntArray(padding); auto atOutputPadding = impl::aten::buildAtIntArray(output_padding); @@ -2216,21 +2381,21 @@ diopiError_t diopiConvTranspose2dBackward(diopiContextHandle_t ctx, diopiTensorH false, grad_input_mask); if (bias_sizes != nullptr && grad_bias != nullptr) { - auto atGradBias = impl::aten::buildATen(grad_bias); + auto atGradBias = impl::aten::buildATenSlow(grad_bias); at::Tensor atTmp = atGrad; int64_t size = atGrad.dim() - 1; while (atGradBias.dim() != size) { atTmp = at::sum(atTmp, -1, false); size -= 1; } - at::sum_out(atGradBias, atTmp, 0, false); + CALL_ATEN_CUDA_FUNC(sum_out, atGradBias, atTmp, 0, false); } #else if (grad_input && grad_weight && grad_bias && bias_sizes) { auto atBiasSizes = impl::aten::buildAtIntArray(bias_sizes); - auto atGradInput = impl::aten::buildATen(grad_input); - auto atGradWeight = impl::aten::buildATen(grad_weight); - auto atGradBias = impl::aten::buildATen(grad_bias); + auto atGradInput = impl::aten::buildATenSlow(grad_input); + auto atGradWeight = impl::aten::buildATenSlow(grad_weight); + auto atGradBias = impl::aten::buildATenSlow(grad_bias); at::convolution_backward_out(atGradInput, atGradWeight, atGradBias, @@ -2251,14 +2416,14 @@ diopiError_t diopiConvTranspose2dBackward(diopiContextHandle_t ctx, 
diopiTensorH impl::aten::updateATen2Tensor(ctx, std::get<0>(grad_inputs), grad_input); impl::aten::updateATen2Tensor(ctx, std::get<1>(grad_inputs), grad_weight); if (bias_sizes != nullptr && grad_bias != nullptr) { - auto atGradBias = impl::aten::buildATen(grad_bias); + auto atGradBias = impl::aten::buildATenSlow(grad_bias); at::Tensor atTmp = atGrad; int64_t size = atGrad.dim() - 1; while (atGradBias.dim() != size) { atTmp = at::sum(atTmp, -1, false); size -= 1; } - at::sum_out(atGradBias, atTmp, 0, false); + CALL_ATEN_CUDA_FUNC(sum_out, atGradBias, atTmp, 0, false); } } #endif @@ -2269,8 +2434,8 @@ diopiError_t diopiConvTranspose2dBackward(diopiContextHandle_t ctx, diopiTensorH diopiError_t diopiEmbeddingBackward(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t grad, diopiConstTensorHandle_t indices, int64_t numWeights, int64_t paddingIdx, bool scaleGradByFreq, bool sparse) { impl::aten::setCurStream(ctx); - auto atGrad = impl::aten::buildATen(grad); - auto atIndices = impl::aten::buildATen(indices); + auto atGrad = impl::aten::buildATenSlow(grad); + auto atIndices = impl::aten::buildATenSlow(indices); impl::aten::invokeATenFuncRet(ctx, at::embedding_backward, out, atGrad, atIndices, numWeights, paddingIdx, scaleGradByFreq, sparse); return diopiSuccess; @@ -2278,46 +2443,48 @@ diopiError_t diopiEmbeddingBackward(diopiContextHandle_t ctx, diopiTensorHandle_ diopiError_t diopiAdaptiveAvgPool2dBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiConstTensorHandle_t grad_output, diopiConstTensorHandle_t input) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atGradInput = impl::aten::buildATen(grad_input); auto atGradOutput = impl::aten::buildATen(grad_output); auto atInput = impl::aten::buildATen(input); - at::_adaptive_avg_pool2d_backward_out(atGradInput, atGradOutput, atInput); + CALL_ATEN_FUNC(_adaptive_avg_pool2d_backward_out, atGradInput, atGradOutput, atInput); return diopiSuccess; } diopiError_t diopiLeakyReluBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiConstTensorHandle_t grad_output, diopiConstTensorHandle_t input, const diopiScalar_t* negative_slope, bool input_is_result) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atGradInput = impl::aten::buildATen(grad_input); auto atGradOutput = impl::aten::buildATen(grad_output); auto atInput = impl::aten::buildATen(input); auto atSlope = impl::aten::buildAtScalar(negative_slope); - at::leaky_relu_backward_out(atGradInput, atGradOutput, atInput, atSlope, input_is_result); + CALL_ATEN_CUDA_FUNC(leaky_relu_backward_out, atGradInput, atGradOutput, atInput, atSlope, input_is_result); return diopiSuccess; } diopiError_t diopiHardtanhBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiConstTensorHandle_t grad_output, diopiConstTensorHandle_t input, const diopiScalar_t* min_val, const diopiScalar_t* max_val) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atGradOutput = impl::aten::buildATen(grad_output); auto atInput = impl::aten::buildATen(input); auto atMin = impl::aten::buildAtScalar(min_val); auto atMax = impl::aten::buildAtScalar(max_val); auto atGradInput = impl::aten::buildATen(grad_input); - at::hardtanh_backward_out(atGradInput, atGradOutput, atInput, atMin, atMax); + CALL_ATEN_CUDA_FUNC(hardtanh_backward_out, atGradInput, atGradOutput, atInput, atMin, atMax); return diopiSuccess; } diopiError_t diopiGeluBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiConstTensorHandle_t grad_output, 
diopiConstTensorHandle_t input, - const char* approximate) { - impl::aten::setCurStream(ctx); - auto atGradInput = impl::aten::buildATen(grad_input); - auto atGradOutput = impl::aten::buildATen(grad_output); - auto atInput = impl::aten::buildATen(input); + const char* approximate) { + impl::aten::setCurStream(ctx); + auto atGradInput = impl::aten::buildATenSlow(grad_input); + auto atGradOutput = impl::aten::buildATenSlow(grad_output); + auto atInput = impl::aten::buildATenSlow(input); c10::string_view atApproximate(approximate, strlen(approximate)); impl::aten::invokeATenFuncRet(ctx, at::gelu_backward, grad_input, atGradOutput, atInput, atApproximate); @@ -2327,6 +2494,7 @@ diopiError_t diopiGeluBackward(diopiContextHandle_t ctx, diopiTensorHandle_t gra diopiError_t diopiAvgPool2dBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiConstTensorHandle_t grad_output, diopiConstTensorHandle_t input, diopiSize_t kernel_size, diopiSize_t stride, diopiSize_t padding, bool ceil_mode, bool count_include_pad, const int64_t* divisor_override) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atGradOutput = impl::aten::buildATen(grad_output); auto atInput = impl::aten::buildATen(input); @@ -2335,29 +2503,31 @@ diopiError_t diopiAvgPool2dBackward(diopiContextHandle_t ctx, diopiTensorHandle_ at::IntArrayRef atPadding = impl::aten::buildAtIntArray(padding); c10::optional atDivisorOverride = divisor_override ? c10::optional(*divisor_override) : c10::nullopt; auto atGradInput = impl::aten::buildATen(grad_input); - at::avg_pool2d_backward_out(atGradInput, atGradOutput, atInput, atKernelSize, atStride, atPadding, ceil_mode, count_include_pad, atDivisorOverride); + CALL_ATEN_CUDA_FUNC(avg_pool2d_backward_out, atGradInput, atGradOutput, atInput, atKernelSize, atStride, atPadding, ceil_mode, count_include_pad, atDivisorOverride); return diopiSuccess; } diopiError_t diopiMSELossBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiConstTensorHandle_t grad_output, diopiConstTensorHandle_t input, diopiConstTensorHandle_t target, diopiReduction_t reduction) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atGradOutput = impl::aten::buildATen(grad_output); auto atInput = impl::aten::buildATen(input); auto atTarget = impl::aten::buildATen(target); auto atGradInput = impl::aten::buildATen(grad_input); - at::mse_loss_backward_out(atGradInput, atGradOutput, atInput, atTarget, reduction); + CALL_ATEN_CUDA_FUNC(mse_loss_backward_out, atGradInput, atGradOutput, atInput, atTarget, reduction); return diopiSuccess; } diopiError_t diopiTanhBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiConstTensorHandle_t grad_output, diopiConstTensorHandle_t input) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atGradOutput = impl::aten::buildATen(grad_output); auto atInput = impl::aten::buildATen(input); auto atGradInput = impl::aten::buildATen(grad_input); - at::tanh_backward_out(atGradInput, atGradOutput, atInput); + CALL_ATEN_CUDA_FUNC(tanh_backward_out, atGradInput, atGradOutput, atInput); return diopiSuccess; } @@ -2365,9 +2535,9 @@ diopiError_t diopiTanhBackward(diopiContextHandle_t ctx, diopiTensorHandle_t gra diopiError_t diopiIndexSelectBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiConstTensorHandle_t grad, diopiSize_t input_sizes, int64_t dim, diopiConstTensorHandle_t index) { impl::aten::setCurStream(ctx); - auto atGrad = impl::aten::buildATen(grad); + auto atGrad = impl::aten::buildATenSlow(grad); at::IntArrayRef atInputSize = 
impl::aten::buildAtIntArray(input_sizes); - auto atIndex = impl::aten::buildATen(index); + auto atIndex = impl::aten::buildATenSlow(index); impl::aten::invokeATenFuncRet(ctx, at::index_select_backward, grad_input, atGrad, atInputSize, dim, atIndex); return diopiSuccess; @@ -2376,7 +2546,7 @@ diopiError_t diopiIndexSelectBackward(diopiContextHandle_t ctx, diopiTensorHandl diopiError_t diopiSelectBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiConstTensorHandle_t grad_output, diopiSize_t input_sizes, int64_t dim, int64_t index) { impl::aten::setCurStream(ctx); - auto atGradOutput = impl::aten::buildATen(grad_output); + auto atGradOutput = impl::aten::buildATenSlow(grad_output); at::IntArrayRef atInputSize = impl::aten::buildAtIntArray(input_sizes); impl::aten::invokeATenFuncRet(ctx, at::select_backward, grad_input, atGradOutput, atInputSize, dim, index); @@ -2385,47 +2555,51 @@ diopiError_t diopiSelectBackward(diopiContextHandle_t ctx, diopiTensorHandle_t g diopiError_t diopiSoftmaxBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiConstTensorHandle_t grad_output, diopiConstTensorHandle_t output, int64_t dim) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atGradInput = impl::aten::buildATen(grad_input); auto atGradOutput = impl::aten::buildATen(grad_output); auto atOutput = impl::aten::buildATen(output); // TODO(huqingqing): use default type instead - at::_softmax_backward_data_out(atGradInput, atGradOutput, atOutput, dim, atOutput.scalar_type()); + CALL_ATEN_CUDA_FUNC(_softmax_backward_data_out, atGradInput, atGradOutput, atOutput, dim, atOutput.scalar_type()); return diopiSuccess; } diopiError_t diopiLogSoftmaxBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiConstTensorHandle_t grad_output, diopiConstTensorHandle_t output, int64_t dim) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atGradInput = impl::aten::buildATen(grad_input); auto atGradOutput = impl::aten::buildATen(grad_output); auto atOutput = impl::aten::buildATen(output); // TODO(huqingqing): use default type instead - at::_log_softmax_backward_data_out(atGradInput, atGradOutput, atOutput, dim, atOutput.scalar_type()); + CALL_ATEN_CUDA_FUNC(_log_softmax_backward_data_out, atGradInput, atGradOutput, atOutput, dim, atOutput.scalar_type()); return diopiSuccess; } diopiError_t diopiSigmoidBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiConstTensorHandle_t grad_output, diopiConstTensorHandle_t output) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atGradOutput = impl::aten::buildATen(grad_output); auto atOutput = impl::aten::buildATen(output); auto atGradInput = impl::aten::buildATen(grad_input); - at::sigmoid_backward_out(atGradInput, atGradOutput, atOutput); + CALL_ATEN_CUDA_FUNC(sigmoid_backward_out, atGradInput, atGradOutput, atOutput); return diopiSuccess; } diopiError_t diopiThresholdBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiConstTensorHandle_t grad_output, diopiConstTensorHandle_t input, const diopiScalar_t* threshold) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atGradInput = impl::aten::buildATen(grad_input); auto atGradOutput = impl::aten::buildATen(grad_output); auto atInput = impl::aten::buildATen(input); auto atThreshold = impl::aten::buildAtScalar(threshold); - at::threshold_backward_out(atGradInput, atGradOutput, atInput, atThreshold); + CALL_ATEN_CUDA_FUNC(threshold_backward_out, atGradInput, atGradOutput, atInput, atThreshold); return diopiSuccess; } @@ 
-2434,13 +2608,13 @@ diopiError_t diopiBCEWithLogitsBackward(diopiContextHandle_t ctx, diopiTensorHan diopiConstTensorHandle_t input, diopiConstTensorHandle_t target, diopiConstTensorHandle_t weight, diopiConstTensorHandle_t pos_weight, diopiReduction_t reduction) { impl::aten::setCurStream(ctx); - auto atGradOutput = impl::aten::buildATen(grad_output); - auto atInput = impl::aten::buildATen(input); - auto atTarget = impl::aten::buildATen(target); + auto atGradOutput = impl::aten::buildATenSlow(grad_output); + auto atInput = impl::aten::buildATenSlow(input); + auto atTarget = impl::aten::buildATenSlow(target); at::Tensor atGradInput; if (pos_weight) { - auto atPosWeight = impl::aten::buildATen(pos_weight); + auto atPosWeight = impl::aten::buildATenSlow(pos_weight); // pos_weight need to be broadcasted, thus mul(target) is not inplace. auto atT = atPosWeight.mul(atTarget); atGradInput = atT.add(1).sub_(atTarget).mul_(atInput.sigmoid()).sub_(atT).mul_(atGradOutput); @@ -2449,7 +2623,7 @@ diopiError_t diopiBCEWithLogitsBackward(diopiContextHandle_t ctx, diopiTensorHan } if (weight) { - auto atWeight = impl::aten::buildATen(weight); + auto atWeight = impl::aten::buildATenSlow(weight); atGradInput.mul_(atWeight); } @@ -2465,9 +2639,9 @@ diopiError_t diopiNLLLossBackward(diopiContextHandle_t ctx, diopiTensorHandle_t diopiConstTensorHandle_t input, diopiConstTensorHandle_t target, diopiConstTensorHandle_t weight, diopiReduction_t reduction, int64_t ignore_index) { impl::aten::setCurStream(ctx); - auto atGradOutput = impl::aten::buildATen(grad_output); - auto atInput = impl::aten::buildATen(input); - auto atTarget = impl::aten::buildATen(target); + auto atGradOutput = impl::aten::buildATenSlow(grad_output); + auto atInput = impl::aten::buildATenSlow(input); + auto atTarget = impl::aten::buildATenSlow(target); auto atGradInput = impl::aten::nllLossNdBackward(atInput, atGradOutput, atTarget, weight, reduction, ignore_index); impl::aten::updateATen2Tensor(ctx, atGradInput, grad_input); @@ -2477,6 +2651,7 @@ diopiError_t diopiNLLLossBackward(diopiContextHandle_t ctx, diopiTensorHandle_t diopiError_t diopiMaxPool2dBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiConstTensorHandle_t grad_output, diopiConstTensorHandle_t input, diopiSize_t kernel_size, diopiSize_t stride, diopiSize_t padding, diopiSize_t dilation, bool ceil_mode, diopiConstTensorHandle_t indices) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atGradOutput = impl::aten::buildATen(grad_output); auto atInput = impl::aten::buildATen(input); @@ -2486,7 +2661,7 @@ diopiError_t diopiMaxPool2dBackward(diopiContextHandle_t ctx, diopiTensorHandle_ at::IntArrayRef atDilation = impl::aten::buildAtIntArray(dilation); auto atIndices = impl::aten::buildATen(indices); auto atGradInput = impl::aten::buildATen(grad_input); - at::max_pool2d_with_indices_backward_out(atGradInput, atGradOutput, atInput, atKernelSize, atStride, atPadding, atDilation, ceil_mode, atIndices); + CALL_ATEN_CUDA_FUNC(max_pool2d_with_indices_backward_out, atGradInput, atGradOutput, atInput, atKernelSize, atStride, atPadding, atDilation, ceil_mode, atIndices); return diopiSuccess; } @@ -2497,19 +2672,19 @@ diopiError_t diopiBatchNormBackward(diopiContextHandle_t ctx, diopiTensorHandle_ diopiConstTensorHandle_t save_invstd, bool training, double eps) { impl::aten::setCurStream(ctx); - auto atGradOutput = impl::aten::buildATen(grad_output); - auto atInput = impl::aten::buildATen(input); - auto atWeight = impl::aten::buildATen(weight); - c10::optional 
atRunningMean = running_mean ? c10::optional(impl::aten::buildATen(running_mean)) : c10::nullopt; - c10::optional atRunningVar = running_var ? c10::optional(impl::aten::buildATen(running_var)) : c10::nullopt; - c10::optional atSaveMean = save_mean ? c10::optional(impl::aten::buildATen(save_mean)) : c10::nullopt; - c10::optional atSaveVar = save_invstd ? c10::optional(impl::aten::buildATen(save_invstd)) : c10::nullopt; + auto atGradOutput = impl::aten::buildATenSlow(grad_output); + auto atInput = impl::aten::buildATenSlow(input); + auto atWeight = impl::aten::buildATenSlow(weight); + c10::optional atRunningMean = running_mean ? c10::optional(impl::aten::buildATenSlow(running_mean)) : c10::nullopt; + c10::optional atRunningVar = running_var ? c10::optional(impl::aten::buildATenSlow(running_var)) : c10::nullopt; + c10::optional atSaveMean = save_mean ? c10::optional(impl::aten::buildATenSlow(save_mean)) : c10::nullopt; + c10::optional atSaveVar = save_invstd ? c10::optional(impl::aten::buildATenSlow(save_invstd)) : c10::nullopt; if (grad_input && grad_weight && grad_bias) { auto grad_input_mask = std::array{true, true, true}; - auto atGradInput = impl::aten::buildATen(grad_input); - auto atGradWeight = impl::aten::buildATen(grad_weight); - auto atGradBias = impl::aten::buildATen(grad_bias); + auto atGradInput = impl::aten::buildATenSlow(grad_input); + auto atGradWeight = impl::aten::buildATenSlow(grad_weight); + auto atGradBias = impl::aten::buildATenSlow(grad_bias); at::native_batch_norm_backward_out(atGradInput, atGradWeight, atGradBias, @@ -2542,21 +2717,23 @@ diopiError_t diopiBatchNormBackward(diopiContextHandle_t ctx, diopiTensorHandle_ } diopiError_t diopiArange(diopiContextHandle_t ctx, diopiTensorHandle_t out, const diopiScalar_t* start, const diopiScalar_t* end, const diopiScalar_t* step) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atOut = impl::aten::buildATen(out); auto atStart = impl::aten::buildAtScalar(start); auto atEnd = impl::aten::buildAtScalar(end); auto atStep = impl::aten::buildAtScalar(step); - at::arange_out(atOut, atStart, atEnd, atStep); + CALL_ATEN_CUDA_FUNC(arange_out, atOut, atStart, atEnd, atStep); return diopiSuccess; } diopiError_t diopiRandperm(diopiContextHandle_t ctx, diopiTensorHandle_t out, int64_t n, diopiGeneratorHandle_t generator) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atOut = impl::aten::buildATen(out); at::Generator gen = impl::aten::buildGenerator(ctx, generator); - at::randperm_out(atOut, n, gen); + CALL_ATEN_CUDA_FUNC(randperm_out, atOut, n, gen); impl::aten::updateGeneratorHandleState(ctx, gen, generator); return diopiSuccess; @@ -2564,7 +2741,7 @@ diopiError_t diopiRandperm(diopiContextHandle_t ctx, diopiTensorHandle_t out, in diopiError_t diopiUniformInp(diopiContextHandle_t ctx, diopiTensorHandle_t inout, double from, double to, diopiGeneratorHandle_t generator) { impl::aten::setCurStream(ctx); - auto atInOut = impl::aten::buildATen(inout); + auto atInOut = impl::aten::buildATenSlow(inout); at::Generator gen = impl::aten::buildGenerator(ctx, generator); at::native::uniform_(atInOut, from, to, gen); impl::aten::updateGeneratorHandleState(ctx, gen, generator); @@ -2574,7 +2751,7 @@ diopiError_t diopiUniformInp(diopiContextHandle_t ctx, diopiTensorHandle_t inout diopiError_t diopiRandomInp(diopiContextHandle_t ctx, diopiTensorHandle_t inout, int64_t from, const int64_t* to, diopiGeneratorHandle_t generator) { impl::aten::setCurStream(ctx); - auto atInOut = impl::aten::buildATen(inout); + auto atInOut = 
impl::aten::buildATenSlow(inout); c10::optional atTo = to ? c10::optional(*to) : c10::nullopt; at::Generator gen = impl::aten::buildGenerator(ctx, generator); at::native::random_(atInOut, from, atTo, gen); @@ -2584,21 +2761,23 @@ diopiError_t diopiRandomInp(diopiContextHandle_t ctx, diopiTensorHandle_t inout, } diopiError_t diopiBernoulliInp(diopiContextHandle_t ctx, diopiTensorHandle_t inout, diopiGeneratorHandle_t generator) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atInOut = impl::aten::buildATen(inout); at::Generator gen = impl::aten::buildGenerator(ctx, generator); - at::bernoulli_out(atInOut, atInOut, gen); + CALL_ATEN_CUDA_FUNC(bernoulli_out, atInOut, atInOut, gen); impl::aten::updateGeneratorHandleState(ctx, gen, generator); return diopiSuccess; } diopiError_t diopiBernoulli(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiGeneratorHandle_t generator) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); auto atOut = impl::aten::buildATen(out); at::Generator gen = impl::aten::buildGenerator(ctx, generator); - at::bernoulli_out(atOut, atInput, gen); + CALL_ATEN_CUDA_FUNC(bernoulli_out, atOut, atInput, gen); impl::aten::updateGeneratorHandleState(ctx, gen, generator); return diopiSuccess; @@ -2606,7 +2785,7 @@ diopiError_t diopiBernoulli(diopiContextHandle_t ctx, diopiTensorHandle_t out, d diopiError_t diopiBernoulliScalar(diopiContextHandle_t ctx, diopiTensorHandle_t out, double p, diopiGeneratorHandle_t generator) { impl::aten::setCurStream(ctx); - auto atOut = impl::aten::buildATen(out); + auto atOut = impl::aten::buildATenSlow(out); at::Generator gen = impl::aten::buildGenerator(ctx, generator); at::native::bernoulli_(atOut, p, gen); impl::aten::updateGeneratorHandleState(ctx, gen, generator); @@ -2615,11 +2794,12 @@ diopiError_t diopiBernoulliScalar(diopiContextHandle_t ctx, diopiTensorHandle_t } diopiError_t diopiNormal(diopiContextHandle_t ctx, diopiTensorHandle_t out, double mean, double std, diopiGeneratorHandle_t generator) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atOut = impl::aten::buildATen(out); auto atSize = atOut.sizes(); at::Generator gen = impl::aten::buildGenerator(ctx, generator); - at::normal_out(atOut, mean, std, atSize, gen); + CALL_ATEN_FUNC(normal_out, atOut, mean, std, atSize, gen); impl::aten::updateGeneratorHandleState(ctx, gen, generator); return diopiSuccess; @@ -2627,7 +2807,7 @@ diopiError_t diopiNormal(diopiContextHandle_t ctx, diopiTensorHandle_t out, doub diopiError_t diopiNormalInp(diopiContextHandle_t ctx, diopiTensorHandle_t inout, double mean, double std, diopiGeneratorHandle_t generator) { impl::aten::setCurStream(ctx); - auto atInOut = impl::aten::buildATen(inout); + auto atInOut = impl::aten::buildATenSlow(inout); at::Generator gen = impl::aten::buildGenerator(ctx, generator); at::native::normal_(atInOut, mean, std, gen); impl::aten::updateGeneratorHandleState(ctx, gen, generator); @@ -2636,11 +2816,12 @@ diopiError_t diopiNormalInp(diopiContextHandle_t ctx, diopiTensorHandle_t inout, diopiError_t diopiNormalTensorScalar(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t mean, double std, diopiGeneratorHandle_t generator) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atOut = impl::aten::buildATen(out); auto atMean = impl::aten::buildATen(mean); at::Generator gen = impl::aten::buildGenerator(ctx, generator); - at::normal_out(atOut, atMean, std, gen); + CALL_ATEN_CUDA_FUNC(normal_out, atOut, 
atMean, std, gen); impl::aten::updateGeneratorHandleState(ctx, gen, generator); return diopiSuccess; @@ -2648,11 +2829,12 @@ diopiError_t diopiNormalTensorScalar(diopiContextHandle diopiError_t diopiNormalScalarTensor(diopiContextHandle_t ctx, diopiTensorHandle_t out, double mean, diopiConstTensorHandle_t std, diopiGeneratorHandle_t generator) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atOut = impl::aten::buildATen(out); auto atStd = impl::aten::buildATen(std); at::Generator gen = impl::aten::buildGenerator(ctx, generator); - at::normal_out(atOut, mean, atStd, gen); + CALL_ATEN_CUDA_FUNC(normal_out, atOut, mean, atStd, gen); impl::aten::updateGeneratorHandleState(ctx, gen, generator); return diopiSuccess; @@ -2660,12 +2842,13 @@ diopiError_t diopiNormalScalarTensor(diopiContextHandle diopiError_t diopiNormalTensor(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t mean, diopiConstTensorHandle_t std, diopiGeneratorHandle_t generator) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atOut = impl::aten::buildATen(out); auto atMean = impl::aten::buildATen(mean); auto atStd = impl::aten::buildATen(std); at::Generator gen = impl::aten::buildGenerator(ctx, generator); - at::normal_out(atOut, atMean, atStd, gen); + CALL_ATEN_CUDA_FUNC(normal_out, atOut, atMean, atStd, gen); impl::aten::updateGeneratorHandleState(ctx, gen, generator); return diopiSuccess; @@ -2673,55 +2856,61 @@ diopiError_t diopiNormalTensor(diopiContextHandle_t out diopiError_t diopiMaskedFill(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t mask, diopiConstTensorHandle_t value) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); auto atMask = impl::aten::buildATen(mask); auto atValue = impl::aten::buildATen(value); auto atOut = impl::aten::buildATen(out); - at::masked_fill_out(atOut, atInput, atMask, atValue); + // not supported by CUDA dispatch yet, will be supported in a subsequent release. + CALL_ATEN_FUNC(masked_fill_out, atOut, atInput, atMask, atValue); return diopiSuccess; } diopiError_t diopiMaskedFillInp(diopiContextHandle_t ctx, diopiTensorHandle_t input, diopiConstTensorHandle_t mask, diopiConstTensorHandle_t value) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); auto atMask = impl::aten::buildATen(mask); auto atValue = impl::aten::buildATen(value); - atInput.masked_fill_(atMask, atValue); + CALL_ATEN_CUDA_FUNC(masked_fill_, atInput, atMask, atValue); return diopiSuccess; } diopiError_t diopiMaskedFillScalar(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t mask, const diopiScalar_t* value) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); auto atMask = impl::aten::buildATen(mask); auto atValue = impl::aten::buildAtScalar(value); auto atOut = impl::aten::buildATen(out); - at::masked_fill_out(atOut, atInput, atMask, atValue); + // not supported by CUDA dispatch yet, will be supported in a subsequent release.
+ CALL_ATEN_FUNC(masked_fill_out, atOut, atInput, atMask, atValue); return diopiSuccess; } diopiError_t diopiMaskedFillInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t input, diopiConstTensorHandle_t mask, const diopiScalar_t* value) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); auto atMask = impl::aten::buildATen(mask); auto atValue = impl::aten::buildAtScalar(value); - atInput.masked_fill_(atMask, atValue); + CALL_ATEN_CUDA_FUNC(masked_fill_, atInput, atMask, atValue); return diopiSuccess; } -diopiError_t diopiMeshGrid(diopiContextHandle_t ctx, diopiTensorHandle_t* outs, diopiConstTensorHandle_t* inputs, int64_t inputsNum) { +diopiError_t diopiMeshGrid(diopiContextHandle_t ctx, diopiTensorHandle_t* outs, diopiConstTensorHandle_t* inputs, int64_t inputsNum) { impl::aten::setCurStream(ctx); DIOPI_CHECK_PTR(outs); DIOPI_CHECK_PTR(inputs); auto outsNum = inputsNum; - auto atInputs = impl::aten::buildATenList(inputs, inputsNum); - auto atOuts = impl::aten::buildATenList(outs, outsNum); + auto atInputs = impl::aten::buildATenListSlow(inputs, inputsNum); + auto atOuts = impl::aten::buildATenListSlow(outs, outsNum); atOuts = at::meshgrid(atInputs); for (int i = 0; i < outsNum; ++i) { impl::aten::updateATen2Tensor(ctx, atOuts[i].contiguous(), outs[i]); @@ -2734,11 +2923,11 @@ diopiError_t diopiAdamW(diopiContextHandle_t ctx, diopiTensorHandle_t param, dio diopiTensorHandle_t exp_avg_sq, diopiTensorHandle_t max_exp_avg_sq, float lr, float beta1, float beta2, float eps, float weight_decay, int64_t step, bool amsgrad) { impl::aten::setCurStream(ctx); - auto atParam = impl::aten::buildATen(param); - auto atGrad = impl::aten::buildATen(grad); - auto atExpAvg = impl::aten::buildATen(exp_avg); - auto atExpAvgSq = impl::aten::buildATen(exp_avg_sq); - auto atMaxExpAvgSq = impl::aten::buildATen(max_exp_avg_sq); + auto atParam = impl::aten::buildATenSlow(param); + auto atGrad = impl::aten::buildATenSlow(grad); + auto atExpAvg = impl::aten::buildATenSlow(exp_avg); + auto atExpAvgSq = impl::aten::buildATenSlow(exp_avg_sq); + auto atMaxExpAvgSq = impl::aten::buildATenSlow(max_exp_avg_sq); atParam.mul_(1 - lr * weight_decay); atExpAvg.mul_(beta1).add_(atGrad, 1 - beta1); @@ -2748,7 +2937,7 @@ diopiError_t diopiAdamW(diopiContextHandle_t ctx, diopiTensorHandle_t param, dio auto bias_correction1 = 1 - pow(beta1, step); auto bias_correction2 = 1 - pow(beta2, step); if (amsgrad) { - at::maximum_out(atMaxExpAvgSq, atMaxExpAvgSq, atExpAvgSq); + CALL_ATEN_CUDA_FUNC(maximum_out, atMaxExpAvgSq, atMaxExpAvgSq, atExpAvgSq); denom = atMaxExpAvgSq.sqrt().div_(sqrt(bias_correction2)).add_(eps); } else { denom = atExpAvgSq.sqrt().div_(sqrt(bias_correction2)).add_(eps); @@ -2763,11 +2952,11 @@ diopiError_t diopiAdam(diopiContextHandle_t ctx, diopiTensorHandle_t param, diop diopiTensorHandle_t exp_avg_sq, diopiTensorHandle_t max_exp_avg_sq, float lr, float beta1, float beta2, float eps, float weight_decay, int64_t step, bool amsgrad) { impl::aten::setCurStream(ctx); - auto atParam = impl::aten::buildATen(param); - auto atGrad = impl::aten::buildATen(grad); - auto atExpAvg = impl::aten::buildATen(exp_avg); - auto atExpAvgSq = impl::aten::buildATen(exp_avg_sq); - auto atMaxExpAvgSq = impl::aten::buildATen(max_exp_avg_sq); + auto atParam = impl::aten::buildATenSlow(param); + auto atGrad = impl::aten::buildATenSlow(grad); + auto atExpAvg = impl::aten::buildATenSlow(exp_avg); + auto atExpAvgSq = impl::aten::buildATenSlow(exp_avg_sq); + auto atMaxExpAvgSq = 
impl::aten::buildATenSlow(max_exp_avg_sq); auto grad_d = atGrad.data(); if (weight_decay != 0) { @@ -2780,7 +2969,7 @@ diopiError_t diopiAdam(diopiContextHandle_t ctx, diopiTensorHandle_t param, diop auto bias_correction1 = 1 - pow(beta1, step); auto bias_correction2 = 1 - pow(beta2, step); if (amsgrad) { - at::maximum_out(atMaxExpAvgSq, atMaxExpAvgSq, atExpAvgSq); + CALL_ATEN_CUDA_FUNC(maximum_out, atMaxExpAvgSq, atMaxExpAvgSq, atExpAvgSq); denom = atMaxExpAvgSq.sqrt().div_(sqrt(bias_correction2)).add_(eps); } else { denom = atExpAvgSq.sqrt().div_(sqrt(bias_correction2)).add_(eps); @@ -2794,10 +2983,10 @@ diopiError_t diopiAdam(diopiContextHandle_t ctx, diopiTensorHandle_t param, diop diopiError_t diopiAdadelta(diopiContextHandle_t ctx, diopiTensorHandle_t param, diopiConstTensorHandle_t grad, diopiTensorHandle_t square_avg, diopiTensorHandle_t acc_delta, float lr, float rho, float eps, float weight_decay) { impl::aten::setCurStream(ctx); - auto atParam = impl::aten::buildATen(param); - auto atGrad = impl::aten::buildATen(grad); - auto atSquareAvg = impl::aten::buildATen(square_avg); - auto atAccDelta = impl::aten::buildATen(acc_delta); + auto atParam = impl::aten::buildATenSlow(param); + auto atGrad = impl::aten::buildATenSlow(grad); + auto atSquareAvg = impl::aten::buildATenSlow(square_avg); + auto atAccDelta = impl::aten::buildATenSlow(acc_delta); auto& param_ = atParam; auto grad_d = atGrad.data(); @@ -2817,11 +3006,11 @@ diopiError_t diopiRmsprop(diopiContextHandle_t ctx, diopiTensorHandle_t param, d diopiTensorHandle_t grad_avg, diopiTensorHandle_t momentum_buf, float lr, float alpha, float eps, float weight_decay, float momentum, bool centered) { impl::aten::setCurStream(ctx); - auto atParam = impl::aten::buildATen(param); - auto atGrad = impl::aten::buildATen(grad); - auto atSquareAvg = impl::aten::buildATen(square_avg); - auto atGradAvg = impl::aten::buildATen(grad_avg); - auto atBuf = impl::aten::buildATen(momentum_buf); + auto atParam = impl::aten::buildATenSlow(param); + auto atGrad = impl::aten::buildATenSlow(grad); + auto atSquareAvg = impl::aten::buildATenSlow(square_avg); + auto atGradAvg = impl::aten::buildATenSlow(grad_avg); + auto atBuf = impl::aten::buildATenSlow(momentum_buf); if (weight_decay != 0) { atGrad = atGrad.add(atParam, weight_decay); @@ -2850,9 +3039,9 @@ diopiError_t diopiConvTranspose2d(diopiContextHandle_t ctx, diopiTensorHandle_t diopiConstTensorHandle_t bias, diopiSize_t stride, diopiSize_t padding, diopiSize_t output_padding, int64_t groups, diopiSize_t dilation) { impl::aten::setCurStream(ctx); - auto atInput = impl::aten::buildATen(input); - auto atWeight = impl::aten::buildATen(weight); - auto atBias = impl::aten::buildATen(bias); + auto atInput = impl::aten::buildATenSlow(input); + auto atWeight = impl::aten::buildATenSlow(weight); + auto atBias = impl::aten::buildATenSlow(bias); auto atStride = impl::aten::buildAtIntArray(stride); auto atPadding = impl::aten::buildAtIntArray(padding); auto atOutputPadding = impl::aten::buildAtIntArray(output_padding); @@ -2863,10 +3052,11 @@ diopiError_t diopiConvTranspose2d(diopiContextHandle_t ctx, diopiTensorHandle_t } diopiError_t diopiCumsum(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, int64_t dim) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); auto atOut = impl::aten::buildATen(out); - at::cumsum_out(atOut, atInput, dim); + CALL_ATEN_CUDA_FUNC(cumsum_out, atOut, atInput, dim); return diopiSuccess; } @@ -2874,8 +3064,8 
@@ diopiError_t diopiCumsum(diopiContextHandle_t ctx, diopiTensorHandle_t out, diop diopiError_t diopiCdist(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input1, diopiConstTensorHandle_t input2, double p, const int64_t* compute_mode) { impl::aten::setCurStream(ctx); - auto atInput1 = impl::aten::buildATen(input1); - auto atInput2 = impl::aten::buildATen(input2); + auto atInput1 = impl::aten::buildATenSlow(input1); + auto atInput2 = impl::aten::buildATenSlow(input2); c10::optional atComputMode = compute_mode ? c10::optional(*compute_mode) : c10::nullopt; impl::aten::invokeATenFuncRet(ctx, at::cdist, out, atInput1, atInput2, p, atComputMode); @@ -2884,74 +3074,81 @@ diopiError_t diopiCdist(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopi diopiError_t diopiCdistBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiConstTensorHandle_t grad_output, diopiConstTensorHandle_t input1, diopiConstTensorHandle_t input2, double p, diopiConstTensorHandle_t cdist) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atGradInput = impl::aten::buildATen(grad_input); auto atGradOutput = impl::aten::buildATen(grad_output); auto atInput1 = impl::aten::buildATen(input1); auto atInput2 = impl::aten::buildATen(input2); auto atCdist = impl::aten::buildATen(cdist); - at::_cdist_backward_out(atGradInput, atGradOutput, atInput1, atInput2, p, atCdist); + CALL_ATEN_FUNC(_cdist_backward_out, atGradInput, atGradOutput, atInput1, atInput2, p, atCdist); return diopiSuccess; } diopiError_t diopiReciprocal(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); auto atOut = impl::aten::buildATen(out); - at::reciprocal_out(atOut, atInput); + CALL_ATEN_CUDA_FUNC(reciprocal_out, atOut, atInput); return diopiSuccess; } diopiError_t diopiReciprocalInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { impl::aten::setCurStream(ctx); - auto atInput = impl::aten::buildATen(input); + auto atInput = impl::aten::buildATenSlow(input); at::reciprocal_(atInput); return diopiSuccess; } diopiError_t diopiBitwiseNot(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); auto atOut = impl::aten::buildATen(out); - at::bitwise_not_out(atOut, atInput); + CALL_ATEN_CUDA_FUNC(bitwise_not_out, atOut, atInput); return diopiSuccess; } diopiError_t diopiBitwiseNotInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); - atInput.bitwise_not_(); + CALL_ATEN_CUDA_FUNC(bitwise_not_, atInput); return diopiSuccess; } diopiError_t diopiLogicalNot(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); auto atOut = impl::aten::buildATen(out); - at::logical_not_out(atOut, atInput); + CALL_ATEN_CUDA_FUNC(logical_not_out, atOut, atInput); return diopiSuccess; } diopiError_t diopiLogicalNotInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); - atInput.logical_not_(); + CALL_ATEN_CUDA_FUNC(logical_not_out, atInput, atInput); return diopiSuccess; } diopiError_t diopiArgmax(diopiContextHandle_t ctx, diopiTensorHandle_t out, 
diopiConstTensorHandle_t input, const int64_t* dim, bool keepdim) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atOut = impl::aten::buildATen(out); auto atInput = impl::aten::buildATen(input); c10::optional atDim = dim ? c10::optional(*dim) : c10::nullopt; - at::argmax_out(atOut, atInput, atDim, keepdim); + CALL_ATEN_CUDA_FUNC(argmax_out, atOut, atInput, atDim, keepdim); return diopiSuccess; } @@ -2959,11 +3156,11 @@ diopiError_t diopiArgmax(diopiContextHandle_t ctx, diopiTensorHandle_t out, diop diopiError_t diopiSmoothL1Loss(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t target, diopiReduction_t reduction, double beta) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atTarget = impl::aten::buildATen(target); + at::Tensor atInput = impl::aten::buildATenSlow(input); + at::Tensor atTarget = impl::aten::buildATenSlow(target); if (reduction == 0) { - at::Tensor atOut = impl::aten::buildATen(out); - at::smooth_l1_loss_out(atOut, atInput, atTarget, reduction, beta); + at::Tensor atOut = impl::aten::buildATenSlow(out); + CALL_ATEN_CUDA_FUNC(smooth_l1_loss_out, atOut, atInput, atTarget, reduction, beta); } else { impl::aten::invokeATenFuncRet(ctx, at::smooth_l1_loss, out, atInput, atTarget, reduction, beta); } @@ -2973,48 +3170,53 @@ diopiError_t diopiSmoothL1Loss(diopiContextHandle_t ctx, diopiTensorHandle_t out diopiError_t diopiSmoothL1LossBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiConstTensorHandle_t grad_output, diopiConstTensorHandle_t input, diopiConstTensorHandle_t target, diopiReduction_t reduction, double beta) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atGradOutput = impl::aten::buildATen(grad_output); auto atInput = impl::aten::buildATen(input); auto atTarget = impl::aten::buildATen(target); auto atGradInput = impl::aten::buildATen(grad_input); - at::smooth_l1_loss_backward_out(atGradInput, atGradOutput, atInput, atTarget, reduction, beta); + CALL_ATEN_CUDA_FUNC(smooth_l1_loss_backward_out, atGradInput, atGradOutput, atInput, atTarget, reduction, beta); return diopiSuccess; } diopiError_t diopiMaximum(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t other) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); auto atOther = impl::aten::buildATen(other); auto atOut = impl::aten::buildATen(out); - at::maximum_out(atOut, atInput, atOther); + CALL_ATEN_CUDA_FUNC(maximum_out, atOut, atInput, atOther); return diopiSuccess; } diopiError_t diopiMinimum(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t other) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); auto atOther = impl::aten::buildATen(other); auto atOut = impl::aten::buildATen(out); - at::minimum_out(atOut, atInput, atOther); + CALL_ATEN_CUDA_FUNC(minimum_out, atOut, atInput, atOther); return diopiSuccess; } diopiError_t diopiMm(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t mat2) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); auto atMat2 = impl::aten::buildATen(mat2); auto atOut = impl::aten::buildATen(out); - at::mm_out(atOut, atInput, atMat2); + CALL_ATEN_CUDA_FUNC(mm_out, atOut, atInput, atMat2); return diopiSuccess; } diopiError_t 
diopiConvolution3d(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t weight, diopiConstTensorHandle_t bias, diopiSize_t stride, diopiSize_t padding, diopiSize_t dilation, int64_t groups) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atOut = impl::aten::buildATen(out); auto atInput = impl::aten::buildATen(input); @@ -3023,7 +3225,7 @@ diopiError_t diopiConvolution3d(diopiContextHandle_t ctx, diopiTensorHandle_t ou auto atStride = impl::aten::buildAtIntArray(stride); auto atPadding = impl::aten::buildAtIntArray(padding); auto atDilation = impl::aten::buildAtIntArray(dilation); - at::convolution_out(atOut, atInput, atWeight, atBias, atStride, atPadding, atDilation, false, at::IntArrayRef(0), groups); + CALL_ATEN_FUNC(convolution_out, atOut, atInput, atWeight, atBias, atStride, atPadding, atDilation, false, at::IntArrayRef(0), groups); return diopiSuccess; } @@ -3033,9 +3235,9 @@ diopiError_t diopiConvolution3dBackward(diopiContextHandle_t ctx, diopiTensorHan diopiConstTensorHandle_t weight, diopiSize_t* bias_sizes, diopiSize_t stride, diopiSize_t padding, diopiSize_t dilation, int64_t groups) { impl::aten::setCurStream(ctx); - auto atInput = impl::aten::buildATen(input); - auto atGrad = impl::aten::buildATen(grad_output); - auto atWeight = impl::aten::buildATen(weight); + auto atInput = impl::aten::buildATenSlow(input); + auto atGrad = impl::aten::buildATenSlow(grad_output); + auto atWeight = impl::aten::buildATenSlow(weight); auto atStride = impl::aten::buildAtIntArray(stride); auto atPadding = impl::aten::buildAtIntArray(padding); std::vector outputPadding(padding.len, 0); @@ -3047,7 +3249,7 @@ diopiError_t diopiConvolution3dBackward(diopiContextHandle_t ctx, diopiTensorHan impl::aten::invokeATenFuncRet( ctx, at::miopen_convolution_backward, vecOut, atInput, atGrad, atWeight, atPadding, atStride, atDilation, groups, false, false, grad_input_mask); if (bias_sizes != nullptr && grad_bias != nullptr) { - auto atBias = impl::aten::buildATen(grad_bias); + auto atBias = impl::aten::buildATenSlow(grad_bias); at::Tensor atTmp = atGrad; int64_t size = atGrad.dim() - 1; while (atBias.dim() != size) { @@ -3062,9 +3264,9 @@ diopiError_t diopiConvolution3dBackward(diopiContextHandle_t ctx, diopiTensorHan #else if (grad_input && grad_weight && grad_bias && bias_sizes) { auto atBiasSizes = impl::aten::buildAtIntArray(bias_sizes); - auto atGradInput = impl::aten::buildATen(grad_input); - auto atGradWeight = impl::aten::buildATen(grad_weight); - auto atGradBias = impl::aten::buildATen(grad_bias); + auto atGradInput = impl::aten::buildATenSlow(grad_input); + auto atGradWeight = impl::aten::buildATenSlow(grad_weight); + auto atGradBias = impl::aten::buildATenSlow(grad_bias); at::convolution_backward_out(atGradInput, atGradWeight, atGradBias, @@ -3086,14 +3288,14 @@ diopiError_t diopiConvolution3dBackward(diopiContextHandle_t ctx, diopiTensorHan impl::aten::updateATen2Tensor(ctx, std::get<1>(grad_inputs), grad_weight); if (bias_sizes != nullptr && grad_bias != nullptr) { - auto atGradBias = impl::aten::buildATen(grad_bias); + auto atGradBias = impl::aten::buildATenSlow(grad_bias); at::Tensor atTmp = atGrad; int64_t size = atGrad.dim() - 1; while (atGradBias.dim() != size) { atTmp = at::sum(atTmp, -1, false); size -= 1; } - at::sum_out(atGradBias, atTmp, 0, false); + CALL_ATEN_CUDA_FUNC(sum_out, atGradBias, atTmp, 0, false); } } #endif @@ -3103,7 +3305,7 @@ diopiError_t diopiConvolution3dBackward(diopiContextHandle_t ctx, diopiTensorHan 
diopiError_t diopiExpand(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { impl::aten::setCurStream(ctx); - auto atInput = impl::aten::buildATen(input); + auto atInput = impl::aten::buildATenSlow(input); diopiSize_t size; diopiGetTensorShape(out, &size); auto atSize = impl::aten::buildAtIntArray(size); @@ -3115,7 +3317,7 @@ diopiError_t diopiExpand(diopiContextHandle_t ctx, diopiTensorHandle_t out, diop diopiError_t diopiUnfold(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, int64_t dim, int64_t size, int64_t step) { impl::aten::setCurStream(ctx); - auto atInput = impl::aten::buildATen(input); + auto atInput = impl::aten::buildATenSlow(input); // must use contiguous rather than clone in this case auto atOut = at::native::unfold(atInput, dim, size, step).contiguous(); impl::aten::updateATen2Tensor(ctx, atOut, out); @@ -3125,11 +3327,13 @@ diopiError_t diopiUnfold(diopiContextHandle_t ctx, diopiTensorHandle_t out, diop diopiError_t diopiUnfoldBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiConstTensorHandle_t grad_output, diopiSize_t input_sizes, int64_t dim, int64_t size, int64_t step) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atGradInput = impl::aten::buildATen(grad_input); auto atGrad = impl::aten::buildATen(grad_output); auto atInputSize = impl::aten::buildAtIntArray(input_sizes); - at::unfold_backward_out(atGradInput, atGrad, atInputSize, dim, size, step); + // not supported by CUDA dispatch yet, will be supported in a subsequent release. + CALL_ATEN_FUNC(unfold_backward_out, atGradInput, atGrad, atInputSize, dim, size, step); return diopiSuccess; } @@ -3137,8 +3341,8 @@ diopiError_t diopiUnfoldBackward(diopiContextHandle_t ctx, diopiTensorHandle_t g diopiError_t diopiMaskedSelect(diopiContextHandle_t ctx, diopiTensorHandle_t* out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t mask) { impl::aten::setCurStream(ctx); DIOPI_CHECK_PTR(out); - auto atInput = impl::aten::buildATen(input); - auto atMask = impl::aten::buildATen(mask); + auto atInput = impl::aten::buildATenSlow(input); + auto atMask = impl::aten::buildATenSlow(mask); auto atOut = at::masked_select(atInput, atMask); impl::aten::buildDiopiTensor(ctx, atOut, out); @@ -3148,9 +3352,9 @@ diopiError_t diopiMaskedSelect(diopiContextHandle_t ctx, diopiTensorHandle_t* ou diopiError_t diopiMaskedSelectBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiConstTensorHandle_t grad_output, diopiConstTensorHandle_t input, diopiConstTensorHandle_t mask) { impl::aten::setCurStream(ctx); - auto atGradOutput = impl::aten::buildATen(grad_output); - auto atInput = impl::aten::buildATen(input); - auto atMask = impl::aten::buildATen(mask); + auto atGradOutput = impl::aten::buildATenSlow(grad_output); + auto atInput = impl::aten::buildATenSlow(input); + auto atMask = impl::aten::buildATenSlow(mask); impl::aten::invokeATenFuncRet(ctx, at::masked_select_backward, grad_input, atGradOutput, atInput, atMask); return diopiSuccess; @@ -3158,73 +3362,83 @@ diopiError_t diopiMaskedSelectBackward(diopiContextHandle_t ctx, diopiTensorHand diopiError_t diopiIndexFillScalar(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, int64_t dim, diopiConstTensorHandle_t index, const diopiScalar_t* value) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); auto atIndex = impl::aten::buildATen(index); auto atValue = impl::aten::buildAtScalar(value); auto atOut =
impl::aten::buildATen(out); - at::index_fill_out(atOut, atInput, dim, atIndex, atValue); + // not supported by CUDA dispatch yet, will be supported in a subsequent release. + CALL_ATEN_FUNC(index_fill_out, atOut, atInput, dim, atIndex, atValue); return diopiSuccess; } diopiError_t diopiIndexFill(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, int64_t dim, diopiConstTensorHandle_t index, diopiConstTensorHandle_t value) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); auto atIndex = impl::aten::buildATen(index); auto atValue = impl::aten::buildATen(value); auto atOut = impl::aten::buildATen(out); - at::index_fill_out(atOut, atInput, dim, atIndex, atValue); + // not supported by CUDA dispatch yet, will be supported in a subsequent release. + CALL_ATEN_FUNC(index_fill_out, atOut, atInput, dim, atIndex, atValue); return diopiSuccess; } diopiError_t diopiIndexFillInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t input, int64_t dim, diopiConstTensorHandle_t index, const diopiScalar_t* value) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); auto atIndex = impl::aten::buildATen(index); auto atValue = impl::aten::buildAtScalar(value); - atInput.index_fill_(dim, atIndex, atValue); + CALL_ATEN_CUDA_FUNC(index_fill_, atInput, dim, atIndex, atValue); return diopiSuccess; } diopiError_t diopiIndexFillInp(diopiContextHandle_t ctx, diopiTensorHandle_t input, int64_t dim, diopiConstTensorHandle_t index, diopiConstTensorHandle_t value) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); auto atIndex = impl::aten::buildATen(index); auto atValue = impl::aten::buildATen(value); - atInput.index_fill_(dim, atIndex, atValue); + CALL_ATEN_CUDA_FUNC(index_fill_, atInput, dim, atIndex, atValue); return diopiSuccess; } diopiError_t diopiLinspace(diopiContextHandle_t ctx, diopiTensorHandle_t out, const diopiScalar_t* start, const diopiScalar_t* end, int64_t steps) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atStart = impl::aten::buildAtScalar(start); auto atEnd = impl::aten::buildAtScalar(end); c10::optional atStep(steps); at::Tensor atOut = impl::aten::buildATen(out); - at::linspace_out(atOut, atStart, atEnd, steps); + CALL_ATEN_CUDA_FUNC(linspace_out, atOut, atStart, atEnd, steps); return diopiSuccess; } diopiError_t diopiRoll(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiSize_t shifts, diopiSize_t dims) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); at::IntArrayRef atShifts = impl::aten::buildAtIntArray(shifts); at::IntArrayRef atDims = impl::aten::buildAtIntArray(dims); auto atOut = impl::aten::buildATen(out); - at::roll_out(atOut, atInput, atShifts, atDims); + // not supported by CUDA dispatch yet, will be supported in a subsequent release.
+ CALL_ATEN_FUNC(roll_out, atOut, atInput, atShifts, atDims); return diopiSuccess; } diopiError_t diopiNorm(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, const diopiScalar_t* p, diopiSize_t dim) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); auto atOut = impl::aten::buildATen(out); @@ -3234,13 +3448,14 @@ diopiError_t diopiNorm(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiC if (atInput.dim() == atOut.dim()) { keepdim = true; } - at::norm_out(atOut, atInput, atP, atDim, keepdim); + CALL_ATEN_CUDA_FUNC(norm_out, atOut, atInput, atP, atDim, keepdim); return diopiSuccess; } diopiError_t diopiGroupNorm(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiTensorHandle_t save_mean, diopiTensorHandle_t save_invstd, diopiConstTensorHandle_t input, diopiConstTensorHandle_t weight, diopiConstTensorHandle_t bias, int64_t num_groups, double eps) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Tensor atWeight = impl::aten::buildATen(weight); @@ -3252,7 +3467,8 @@ diopiError_t diopiGroupNorm(diopiContextHandle_t ctx, diopiTensorHandle_t out, d const int64_t C = atInput.size(1); const auto input_shape = atInput.sizes(); const int64_t HxW = c10::multiply_integers(input_shape.cbegin() + 2, input_shape.cend()); - at::native_group_norm_out(atOut, atSaveMean, atSaveInvstd, atInput, atWeight, atBias, N, C, HxW, num_groups, eps); + // not supported by CUDA dispatch yet, will be supported in a subsequent release. + CALL_ATEN_FUNC(native_group_norm_out, atOut, atSaveMean, atSaveInvstd, atInput, atWeight, atBias, N, C, HxW, num_groups, eps); return diopiSuccess; } @@ -3261,19 +3477,19 @@ diopiError_t diopiGroupNormBackward(diopiContextHandle_t ctx, diopiTensorHandle_ diopiConstTensorHandle_t grad_output, diopiConstTensorHandle_t input, diopiConstTensorHandle_t weight, diopiConstTensorHandle_t mean, diopiConstTensorHandle_t rstd, int64_t num_groups) { impl::aten::setCurStream(ctx); - auto atGradOutput = impl::aten::buildATen(grad_output); - auto atInput = impl::aten::buildATen(input); - auto atWeight = impl::aten::buildATen(weight); - auto atSaveMean = impl::aten::buildATen(mean); - auto atSaveVar = impl::aten::buildATen(rstd); + auto atGradOutput = impl::aten::buildATenSlow(grad_output); + auto atInput = impl::aten::buildATenSlow(input); + auto atWeight = impl::aten::buildATenSlow(weight); + auto atSaveMean = impl::aten::buildATenSlow(mean); + auto atSaveVar = impl::aten::buildATenSlow(rstd); const int64_t N = atInput.size(0); const int64_t C = atInput.size(1); const auto input_shape = atInput.sizes(); const int64_t HxW = c10::multiply_integers(input_shape.cbegin() + 2, input_shape.cend()); if (grad_weight && grad_bias) { - auto atGradInput = impl::aten::buildATen(grad_input); - auto atGradWeight = impl::aten::buildATen(grad_weight); - auto atGradBias = impl::aten::buildATen(grad_bias); + auto atGradInput = impl::aten::buildATenSlow(grad_input); + auto atGradWeight = impl::aten::buildATenSlow(grad_weight); + auto atGradBias = impl::aten::buildATenSlow(grad_bias); at::native_group_norm_backward_out( atGradInput, atGradWeight, atGradBias, atGradOutput, atInput, atSaveMean, atSaveVar, atWeight, N, C, HxW, num_groups, {true, true, true}); } else { @@ -3290,12 +3506,12 @@ diopiError_t diopiGroupNormBackward(diopiContextHandle_t ctx, diopiTensorHandle_ diopiError_t diopiBCELoss(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input,
diopiConstTensorHandle_t target, diopiConstTensorHandle_t weight, diopiReduction_t reduction) { impl::aten::setCurStream(ctx); - at::Tensor atOut = impl::aten::buildATen(out); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atTarget = impl::aten::buildATen(target); - c10::optional atWeight = weight ? c10::optional(impl::aten::buildATen(weight)) : c10::nullopt; + at::Tensor atOut = impl::aten::buildATenSlow(out); + at::Tensor atInput = impl::aten::buildATenSlow(input); + at::Tensor atTarget = impl::aten::buildATenSlow(target); + c10::optional atWeight = weight ? c10::optional(impl::aten::buildATenSlow(weight)) : c10::nullopt; if (reduction == 0) { - at::binary_cross_entropy_out(atOut, atInput, atTarget, atWeight, reduction); + CALL_ATEN_CUDA_FUNC(binary_cross_entropy_out, atOut, atInput, atTarget, atWeight, reduction); } else { impl::aten::invokeATenFuncRet(ctx, at::binary_cross_entropy, out, atInput, atTarget, atWeight, reduction); } @@ -3306,13 +3522,14 @@ diopiError_t diopiBCELoss(diopiContextHandle_t ctx, diopiTensorHandle_t out, dio diopiError_t diopiBCELossBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiConstTensorHandle_t grad_output, diopiConstTensorHandle_t input, diopiConstTensorHandle_t target, diopiConstTensorHandle_t weight, diopiReduction_t reduction) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atGradOutput = impl::aten::buildATen(grad_output); auto atInput = impl::aten::buildATen(input); auto atTarget = impl::aten::buildATen(target); c10::optional atWeight = weight ? c10::optional(impl::aten::buildATen(weight)) : c10::nullopt; auto atGradInput = impl::aten::buildATen(grad_input); - at::binary_cross_entropy_backward_out(atGradInput, atGradOutput, atInput, atTarget, atWeight, reduction); + CALL_ATEN_CUDA_FUNC(binary_cross_entropy_backward_out, atGradInput, atGradOutput, atInput, atTarget, atWeight, reduction); return diopiSuccess; } @@ -3321,16 +3538,16 @@ diopiError_t diopiLayerNorm(diopiContextHandle_t ctx, diopiTensorHandle_t out, d diopiConstTensorHandle_t input, diopiConstTensorHandle_t weight, diopiConstTensorHandle_t bias, diopiSize_t normalized_shape, double eps) { impl::aten::setCurStream(ctx); - at::Tensor atOut = impl::aten::buildATen(out); - at::Tensor atSaveMean = impl::aten::buildATen(save_mean); - at::Tensor atSaveInvstd = impl::aten::buildATen(save_invstd); + at::Tensor atOut = impl::aten::buildATenSlow(out); + at::Tensor atSaveMean = impl::aten::buildATenSlow(save_mean); + at::Tensor atSaveInvstd = impl::aten::buildATenSlow(save_invstd); - at::Tensor atInput = impl::aten::buildATen(input); - c10::optional atWeight = weight ? c10::optional(impl::aten::buildATen(weight)) : c10::nullopt; - c10::optional atBias = bias ? c10::optional(impl::aten::buildATen(bias)) : c10::nullopt; + at::Tensor atInput = impl::aten::buildATenSlow(input); + c10::optional atWeight = weight ? c10::optional(impl::aten::buildATenSlow(weight)) : c10::nullopt; + c10::optional atBias = bias ? c10::optional(impl::aten::buildATenSlow(bias)) : c10::nullopt; auto atNormalizedShape = impl::aten::buildAtIntArray(normalized_shape); // TODO(zhaoguochun): check dtype: when input is half, atSaveInvstd, atInput should be float? 
- // at::native_layer_norm_out(atOut, atSaveMean, atSaveInvstd, atInput, atNormalizedShape, atWeight, atBias, eps); + // CALL_ATEN_CUDA_FUNC(native_layer_norm_out, atOut, atSaveMean, atSaveInvstd, atInput, atNormalizedShape, atWeight, atBias, eps); diopi_tensor_list vecOut = {out, save_mean, save_invstd}; impl::aten::invokeATenFuncRet(ctx, at::native_layer_norm, vecOut, atInput, atNormalizedShape, atWeight, atBias, eps); @@ -3345,36 +3562,36 @@ diopiError_t diopiLayerNormBackward(diopiContextHandle_t ctx, diopiTensorHandle_ if (rstd) { diopiGetTensorDtype(rstd, &rDtype); } - auto atGradOutput = impl::aten::buildATen(grad_output); - auto atInput = impl::aten::buildATen(input); + auto atGradOutput = impl::aten::buildATenSlow(grad_output); + auto atInput = impl::aten::buildATenSlow(input); auto atNormalizedShape = impl::aten::buildAtIntArray(normalized_shape); c10::optional atWeight; c10::optional atBias; auto grad_input_mask = std::array{true, false, false}; if (weight != nullptr) { - atWeight = c10::optional(impl::aten::buildATen(weight)); + atWeight = c10::optional(impl::aten::buildATenSlow(weight)); grad_input_mask.at(1) = true; } if (bias != nullptr) { - atBias = c10::optional(impl::aten::buildATen(bias)); + atBias = c10::optional(impl::aten::buildATenSlow(bias)); grad_input_mask.at(2) = true; } - auto atSaveMean = impl::aten::buildATen(mean); + auto atSaveMean = impl::aten::buildATenSlow(mean); diopiGetTensorDtype(mean, &mDtype); if (diopiDtype_t::diopi_dtype_float16 == mDtype) { atSaveMean = at::native::to(atSaveMean, impl::aten::getATenType(diopiDtype_t::diopi_dtype_float32).toScalarType(), false, true, c10::nullopt); } - auto atSaveVar = impl::aten::buildATen(rstd); + auto atSaveVar = impl::aten::buildATenSlow(rstd); diopiGetTensorDtype(rstd, &rDtype); if (diopiDtype_t::diopi_dtype_float16 == rDtype) { atSaveVar = at::native::to(atSaveVar, impl::aten::getATenType(diopiDtype_t::diopi_dtype_float32).toScalarType(), false, true, c10::nullopt); } if (grad_input && grad_weight && grad_bias) { - auto atGradInput = impl::aten::buildATen(grad_input); - auto atGradWeight = impl::aten::buildATen(grad_weight); - auto atGradBias = impl::aten::buildATen(grad_bias); + auto atGradInput = impl::aten::buildATenSlow(grad_input); + auto atGradWeight = impl::aten::buildATenSlow(grad_weight); + auto atGradBias = impl::aten::buildATenSlow(grad_bias); at::native_layer_norm_backward_out( atGradInput, atGradWeight, atGradBias, atGradOutput, atInput, atNormalizedShape, atSaveMean, atSaveVar, atWeight, atBias, grad_input_mask); } else { @@ -3394,29 +3611,31 @@ diopiError_t diopiLayerNormBackward(diopiContextHandle_t ctx, diopiTensorHandle_ } diopiError_t diopiAdaptiveAvgPool3d(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiSize_t output_size) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); auto atOutSize = impl::aten::buildAtIntArray(output_size); auto atOut = impl::aten::buildATen(out); - at::adaptive_avg_pool3d_out(atOut, atInput, atOutSize); + CALL_ATEN_CUDA_FUNC(adaptive_avg_pool3d_out, atOut, atInput, atOutSize); return diopiSuccess; } diopiError_t diopiAdaptiveAvgPool3dBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiConstTensorHandle_t grad_output, diopiConstTensorHandle_t input) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atGradOutput = impl::aten::buildATen(grad_output); auto atInput = impl::aten::buildATen(input); auto atGradInput = impl::aten::buildATen(grad_input); - 
at::adaptive_avg_pool3d_backward_out(atGradInput, atGradOutput, atInput); + CALL_ATEN_CUDA_FUNC(adaptive_avg_pool3d_backward_out, atGradInput, atGradOutput, atInput); return diopiSuccess; } diopiError_t diopiAdaptiveMaxPool3d(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiSize_t output_size) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + at::Tensor atInput = impl::aten::buildATenSlow(input); auto atOutSize = impl::aten::buildAtIntArray(output_size); auto atOuts = at::adaptive_max_pool3d(atInput, atOutSize); impl::aten::updateATen2Tensor(ctx, std::get<0>(atOuts), out); @@ -3426,24 +3645,26 @@ diopiError_t diopiAdaptiveMaxPool3d(diopiContextHandle_t ctx, diopiTensorHandle_ diopiError_t diopiAdaptiveMaxPool3dWithIndices(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiTensorHandle_t indices, diopiConstTensorHandle_t input, diopiSize_t output_size) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); auto atOutSize = impl::aten::buildAtIntArray(output_size); auto atOut = impl::aten::buildATen(out); auto atIndices = impl::aten::buildATen(indices); - at::adaptive_max_pool3d_out(atOut, atIndices, atInput, atOutSize); + CALL_ATEN_CUDA_FUNC(adaptive_max_pool3d_out, atOut, atIndices, atInput, atOutSize); return diopiSuccess; } diopiError_t diopiAdaptiveMaxPool3dBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiConstTensorHandle_t grad_output, diopiConstTensorHandle_t input, diopiConstTensorHandle_t indices) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Tensor atGradOutput = impl::aten::buildATen(grad_output); at::Tensor atIndices = impl::aten::buildATen(indices); at::Tensor atGradInput = impl::aten::buildATen(grad_input); - at::adaptive_max_pool3d_backward_out(atGradInput, atGradOutput, atInput, atIndices); + CALL_ATEN_CUDA_FUNC(adaptive_max_pool3d_backward_out, atGradInput, atGradOutput, atInput, atIndices); return diopiSuccess; } @@ -3451,7 +3672,7 @@ diopiError_t diopiAdaptiveMaxPool3dBackward(diopiContextHandle_t ctx, diopiTenso diopiError_t diopiMaxPool3d(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiSize_t kernel_size, diopiSize_t stride, diopiSize_t padding, diopiSize_t dilation, bool ceil_mode) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + at::Tensor atInput = impl::aten::buildATenSlow(input); at::IntArrayRef atKernelSize = impl::aten::buildAtIntArray(kernel_size); at::IntArrayRef atStride = impl::aten::buildAtIntArray(stride); at::IntArrayRef atPadding = impl::aten::buildAtIntArray(padding); @@ -3464,6 +3685,7 @@ diopiError_t diopiMaxPool3d(diopiContextHandle_t ctx, diopiTensorHandle_t out, d diopiError_t diopiMaxPool3dWithIndices(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiTensorHandle_t indices, diopiConstTensorHandle_t input, diopiSize_t kernel_size, diopiSize_t stride, diopiSize_t padding, diopiSize_t dilation, bool ceil_mode) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::IntArrayRef atKernelSize = impl::aten::buildAtIntArray(kernel_size); @@ -3473,7 +3695,7 @@ diopiError_t diopiMaxPool3dWithIndices(diopiContextHandle_t ctx, diopiTensorHand bool atCeilMode = ceil_mode; at::Tensor atOut = impl::aten::buildATen(out); at::Tensor atIndices = impl::aten::buildATen(indices); - at::max_pool3d_with_indices_out(atOut, 
atIndices, atInput, atKernelSize, atStride, atPadding, atDilation, atCeilMode); + CALL_ATEN_CUDA_FUNC(max_pool3d_with_indices_out, atOut, atIndices, atInput, atKernelSize, atStride, atPadding, atDilation, atCeilMode); return diopiSuccess; } @@ -3481,6 +3703,7 @@ diopiError_t diopiMaxPool3dWithIndices(diopiContextHandle_t ctx, diopiTensorHand diopiError_t diopiMaxPool3dBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiConstTensorHandle_t grad_output, diopiConstTensorHandle_t input, diopiSize_t kernel_size, diopiSize_t stride, diopiSize_t padding, diopiSize_t dilation, bool ceil_mode, diopiConstTensorHandle_t indices) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atGradOutput = impl::aten::buildATen(grad_output); auto atInput = impl::aten::buildATen(input); @@ -3490,14 +3713,14 @@ diopiError_t diopiMaxPool3dBackward(diopiContextHandle_t ctx, diopiTensorHandle_ at::IntArrayRef atDilation = impl::aten::buildAtIntArray(dilation); auto atIndices = impl::aten::buildATen(indices); auto atGradInput = impl::aten::buildATen(grad_input); - at::max_pool3d_with_indices_backward_out(atGradInput, atGradOutput, atInput, atKernelSize, atStride, atPadding, atDilation, ceil_mode, atIndices); + CALL_ATEN_CUDA_FUNC(max_pool3d_with_indices_backward_out, atGradInput, atGradOutput, atInput, atKernelSize, atStride, atPadding, atDilation, ceil_mode, atIndices); return diopiSuccess; } diopiError_t diopiPermute(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiSize_t dims) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + at::Tensor atInput = impl::aten::buildATenSlow(input); auto atDims = impl::aten::buildAtIntArray(dims); impl::aten::invokeATenFuncRet(ctx, at::permute, out, atInput, atDims); @@ -3506,8 +3729,8 @@ diopiError_t diopiPermute(diopiContextHandle_t ctx, diopiTensorHandle_t out, dio diopiError_t diopiCopyInp(diopiContextHandle_t ctx, diopiConstTensorHandle_t src, diopiTensorHandle_t dest) { impl::aten::setCurStream(ctx); - at::Tensor atDest = impl::aten::buildATen(dest); - at::Tensor atSrc = impl::aten::buildATen(src); + at::Tensor atDest = impl::aten::buildATenSlow(dest); + at::Tensor atSrc = impl::aten::buildATenSlow(src); // Set non_blocking true to avoid stream sync thus improving performance. // The data is not ready when diopiCopyInp returns. // If you need to use it immediately, please call cudaStreamSynchronize first. 
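The comment above has a practical consequence for callers: because the copy is enqueued with non_blocking set to true, the destination tensor only holds valid data once the stream that diopiCopyInp ran on has been synchronized. A minimal caller-side sketch of that pattern follows; getCudaStreamFromContext is a hypothetical helper, since how the CUDA stream is recovered from a diopiContextHandle_t is backend-specific glue that is not part of this patch.

// Illustrative sketch only, not part of the patch: consuming the asynchronous diopiCopyInp.
// (Include the project's DIOPI runtime/functions headers as appropriate.)
#include <cuda_runtime.h>

void copyThenRead(diopiContextHandle_t ctx, diopiConstTensorHandle_t src, diopiTensorHandle_t dest) {
    diopiCopyInp(ctx, src, dest);                         // enqueues a non-blocking device copy
    cudaStream_t stream = getCudaStreamFromContext(ctx);  // hypothetical accessor for the stream bound to ctx
    cudaStreamSynchronize(stream);                        // wait until dest actually holds the data
    // dest is now safe to read or copy to the host
}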
@@ -3517,11 +3740,12 @@ diopiError_t diopiCopyInp(diopiContextHandle_t ctx, diopiConstTensorHandle_t src } diopiError_t diopiGather(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, int64_t dim, diopiConstTensorHandle_t index) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); auto atIndex = impl::aten::buildATen(index); auto atOut = impl::aten::buildATen(out); - at::gather_out(atOut, atInput, dim, atIndex); + CALL_ATEN_CUDA_FUNC(gather_out, atOut, atInput, dim, atIndex); return diopiSuccess; } @@ -3529,9 +3753,9 @@ diopiError_t diopiGather(diopiContextHandle_t ctx, diopiTensorHandle_t out, diop diopiError_t diopiGatherBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiConstTensorHandle_t grad_output, diopiConstTensorHandle_t input, int64_t dim, diopiConstTensorHandle_t index) { impl::aten::setCurStream(ctx); - auto atGradOutput = impl::aten::buildATen(grad_output); - auto atInput = impl::aten::buildATen(input); - auto atIndex = impl::aten::buildATen(index); + auto atGradOutput = impl::aten::buildATenSlow(grad_output); + auto atInput = impl::aten::buildATenSlow(input); + auto atIndex = impl::aten::buildATenSlow(index); bool sparse_grad = false; auto atOut = at::gather_backward(atGradOutput, atInput, dim, atIndex, sparse_grad); impl::aten::updateATen2Tensor(ctx, atOut, grad_input); @@ -3540,31 +3764,34 @@ diopiError_t diopiGatherBackward(diopiContextHandle_t ctx, diopiTensorHandle_t g } diopiError_t diopiRemainderTensor(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t other) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); auto atOther = impl::aten::buildATen(other); auto atOut = impl::aten::buildATen(out); - at::remainder_out(atOut, atInput, atOther); + CALL_ATEN_CUDA_FUNC(remainder_out, atOut, atInput, atOther); return diopiSuccess; } diopiError_t diopiRemainderScalar(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, const diopiScalar_t* other) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); auto atOther = impl::aten::buildAtScalar(other); auto atOut = impl::aten::buildATen(out); - at::remainder_out(atOut, atInput, atOther); + CALL_ATEN_CUDA_FUNC(remainder_out, atOut, atInput, c10::scalar_to_tensor(atOther)); return diopiSuccess; } diopiError_t diopiRemainder(diopiContextHandle_t ctx, diopiTensorHandle_t out, const diopiScalar_t* input, diopiConstTensorHandle_t other) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atInputScalar = impl::aten::buildAtScalar(input); auto atOther = impl::aten::buildATen(other); auto atOut = impl::aten::buildATen(out); - at::remainder_out(atOut, atInputScalar, atOther); + CALL_ATEN_CUDA_FUNC(remainder_out, atOut, c10::scalar_to_tensor(atInputScalar), atOther); return diopiSuccess; } @@ -3572,16 +3799,17 @@ diopiError_t diopiCTCLoss(diopiContextHandle_t ctx, diopiTensorHandle_t out, dio diopiConstTensorHandle_t log_probs, diopiConstTensorHandle_t targets, diopiConstTensorHandle_t input_lengths, diopiConstTensorHandle_t target_lengths, int64_t blank, diopiReduction_t reduction, bool zero_infinity) { impl::aten::setCurStream(ctx); - auto atLogProbs = impl::aten::buildATen(log_probs); - auto atTarget = impl::aten::buildATen(targets); - auto atInputLength = impl::aten::buildATen(input_lengths).to(at::Device(at::kCPU), at::kLong).contiguous(); - auto atTargetLength = 
impl::aten::buildATen(target_lengths).to(at::Device(at::kCPU), at::kLong).contiguous(); + auto atLogProbs = impl::aten::buildATenSlow(log_probs); + auto atTarget = impl::aten::buildATenSlow(targets); + auto atInputLength = impl::aten::buildATenSlow(input_lengths).to(at::Device(at::kCPU), at::kLong).contiguous(); + auto atTargetLength = impl::aten::buildATenSlow(target_lengths).to(at::Device(at::kCPU), at::kLong).contiguous(); at::IntArrayRef il(atInputLength.data_ptr(), atInputLength.numel()); at::IntArrayRef tl(atTargetLength.data_ptr(), atTargetLength.numel()); - auto atNegLogLikelihood = impl::aten::buildATen(neg_log_likelihood); - auto atLogAlpha = impl::aten::buildATen(log_alpha); - at::_ctc_loss_out(atNegLogLikelihood, atLogAlpha, atLogProbs, atTarget, il, tl, blank, zero_infinity); + auto atNegLogLikelihood = impl::aten::buildATenSlow(neg_log_likelihood); + auto atLogAlpha = impl::aten::buildATenSlow(log_alpha); + // not supported by CUDA dispatch yet, will be supported in a subsequent release. + CALL_ATEN_FUNC(_ctc_loss_out, atNegLogLikelihood, atLogAlpha, atLogProbs, atTarget, il, tl, blank, zero_infinity); auto atRes = atNegLogLikelihood; if (zero_infinity) { atRes = at::where(atRes == at::Scalar(std::numeric_limits::infinity()), at::zeros({}, atRes.options()), atRes); @@ -3602,16 +3830,16 @@ diopiError_t diopiCTCLossBackward(diopiContextHandle_t ctx, diopiTensorHandle_t diopiConstTensorHandle_t target_lengths, diopiConstTensorHandle_t neg_log_likelihood, diopiConstTensorHandle_t log_alpha, int64_t blank, diopiReduction_t reduction, bool zero_infinity) { impl::aten::setCurStream(ctx); - auto atLogProbs = impl::aten::buildATen(log_probs); - auto atTarget = impl::aten::buildATen(targets); - auto atInputLength = impl::aten::buildATen(input_lengths).to(at::Device(at::kCPU), at::kLong).contiguous(); - auto atTargetLength = impl::aten::buildATen(target_lengths).to(at::Device(at::kCPU), at::kLong).contiguous(); + auto atLogProbs = impl::aten::buildATenSlow(log_probs); + auto atTarget = impl::aten::buildATenSlow(targets); + auto atInputLength = impl::aten::buildATenSlow(input_lengths).to(at::Device(at::kCPU), at::kLong).contiguous(); + auto atTargetLength = impl::aten::buildATenSlow(target_lengths).to(at::Device(at::kCPU), at::kLong).contiguous(); at::IntArrayRef il(atInputLength.data_ptr(), atInputLength.numel()); at::IntArrayRef tl(atTargetLength.data_ptr(), atTargetLength.numel()); int64_t batch_size = atLogProbs.size(1); std::vector expand_shape = {batch_size}; at::IntArrayRef shape(expand_shape.data(), expand_shape.size()); - auto atGrad = impl::aten::buildATen(grad_output); + auto atGrad = impl::aten::buildATenSlow(grad_output); if (reduction == 1) { atGrad = at::native::expand(atGrad, shape).clone(); auto target_lengths_t = at::tensor(tl, atGrad.options()).clamp_min(1); @@ -3620,16 +3848,18 @@ diopiError_t diopiCTCLossBackward(diopiContextHandle_t ctx, diopiTensorHandle_t } else if (reduction == 2) { atGrad = at::native::expand(atGrad, shape); } - auto atNegLogLikehood = impl::aten::buildATen(neg_log_likelihood); - auto atLogAlpha = impl::aten::buildATen(log_alpha); - auto atGradInput = impl::aten::buildATen(grad_input); - at::_ctc_loss_backward_out(atGradInput, atGrad, atLogProbs, atTarget, il, tl, atNegLogLikehood, atLogAlpha, blank, zero_infinity); + auto atNegLogLikehood = impl::aten::buildATenSlow(neg_log_likelihood); + auto atLogAlpha = impl::aten::buildATenSlow(log_alpha); + auto atGradInput = impl::aten::buildATenSlow(grad_input); + // not supported by CUDA dispatch yet, will be
supported in a subsequent release. + CALL_ATEN_FUNC(_ctc_loss_backward_out, atGradInput, atGrad, atLogProbs, atTarget, il, tl, atNegLogLikehood, atLogAlpha, blank, zero_infinity); return diopiSuccess; } diopiError_t diopiIndexPutInp(diopiContextHandle_t ctx, diopiTensorHandle_t input, diopiConstTensorHandle_t values, diopiConstTensorHandle_t* indices, int64_t indices_counts, bool accumulate) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); DIOPI_CHECK_PTR(indices); auto atInput = impl::aten::buildATen(input); @@ -3640,13 +3870,15 @@ diopiError_t diopiIndexPutInp(diopiContextHandle_t ctx, diopiTensorHandle_t inpu auto atIndices = c10::optional(impl::aten::buildATen(indices[i])); atIndicesList.emplace_back(atIndices); } - atInput.index_put_(atIndicesList, atValues, accumulate); + // not supported by CUDA dispatch yet, will be supported in a subsequent release. + CALL_ATEN_FUNC(index_put_, atInput, atIndicesList, atValues, accumulate); return diopiSuccess; } DIOPI_API diopiError_t diopiIndexPut(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t values, diopiConstTensorHandle_t* indices, int64_t indices_counts, bool accumulate) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); DIOPI_CHECK_PTR(indices); auto atInput = impl::aten::buildATen(input); @@ -3658,13 +3890,15 @@ DIOPI_API diopiError_t diopiIndexPut(diopiContextHandle_t ctx, diopiTensorHandle auto atIndices = c10::optional(impl::aten::buildATen(indices[i])); atIndicesList.emplace_back(atIndices); } - at::index_put_out(atOut, atInput, atIndicesList, atValues, accumulate); + // not supported by CUDA dispatch yet, will be supported in a subsequent release. + CALL_ATEN_FUNC(index_put_out, atOut, atInput, atIndicesList, atValues, accumulate); return diopiSuccess; } diopiError_t diopiScatterInp(diopiContextHandle_t ctx, diopiTensorHandle_t input, int64_t dim, diopiConstTensorHandle_t src, diopiConstTensorHandle_t index, const char* reduce) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); auto atSrc = impl::aten::buildATen(src); @@ -3675,9 +3909,9 @@ diopiError_t diopiScatterInp(diopiContextHandle_t ctx, diopiTensorHandle_t input at::Tensor atOut; if (0 == strcmp(reduce, "add") || 0 == strcmp(reduce, "multiply")) { c10::string_view atReduce(reduce, strlen(reduce)); - atInput.scatter_(dim, atIndex, atSrc, atReduce); + CALL_ATEN_CUDA_FUNC(scatter_, atInput, dim, atIndex, atSrc, atReduce); } else { - atInput.scatter_(dim, atIndex, atSrc); + CALL_ATEN_CUDA_FUNC(scatter_, atInput, dim, atIndex, atSrc); } return diopiSuccess; @@ -3685,6 +3919,7 @@ diopiError_t diopiScatterInp(diopiContextHandle_t ctx, diopiTensorHandle_t input diopiError_t diopiScatterInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t input, int64_t dim, const diopiScalar_t* value, diopiConstTensorHandle_t index, const char* reduce) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); auto atValue = impl::aten::buildAtScalar(value); @@ -3695,9 +3930,9 @@ diopiError_t diopiScatterInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t at::Tensor atOut; if (0 == strcmp(reduce, "add") || 0 == strcmp(reduce, "multiply")) { c10::string_view atReduce(reduce, strlen(reduce)); - atInput.scatter_(dim, atIndex, atValue, atReduce); + CALL_ATEN_CUDA_FUNC(scatter_, atInput, dim, atIndex, atValue, atReduce); } else { - atInput.scatter_(dim, atIndex, atValue); + CALL_ATEN_CUDA_FUNC(scatter_, atInput, dim, atIndex, atValue); } return diopiSuccess; @@
-3705,6 +3940,7 @@ diopiError_t diopiScatterInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t diopiError_t diopiScatter(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, int64_t dim, diopiConstTensorHandle_t src, diopiConstTensorHandle_t index, const char* reduce) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); auto atSrc = impl::aten::buildATen(src); @@ -3716,9 +3952,9 @@ diopiError_t diopiScatter(diopiContextHandle_t ctx, diopiTensorHandle_t out, dio } if (0 == strcmp(reduce, "add") || 0 == strcmp(reduce, "multiply")) { c10::string_view atReduce(reduce, strlen(reduce)); - at::scatter_out(atOut, atInput, dim, atIndex, atSrc, atReduce); + CALL_ATEN_CUDA_FUNC(scatter_out, atOut, atInput, dim, atIndex, atSrc, atReduce); } else { - at::scatter_out(atOut, atInput, dim, atIndex, atSrc); + CALL_ATEN_CUDA_FUNC(scatter_out, atOut, atInput, dim, atIndex, atSrc); } return diopiSuccess; @@ -3726,6 +3962,7 @@ diopiError_t diopiScatter(diopiContextHandle_t ctx, diopiTensorHandle_t out, dio diopiError_t diopiScatterScalar(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, int64_t dim, const diopiScalar_t* value, diopiConstTensorHandle_t index, const char* reduce) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); auto atValue = impl::aten::buildAtScalar(value); @@ -3737,25 +3974,26 @@ diopiError_t diopiScatterScalar(diopiContextHandle_t ctx, diopiTensorHandle_t ou } if (0 == strcmp(reduce, "add") || 0 == strcmp(reduce, "multiply")) { c10::string_view atReduce(reduce, strlen(reduce)); - at::scatter_out(atOut, atInput, dim, atIndex, atValue, atReduce); + CALL_ATEN_CUDA_FUNC(scatter_out, atOut, atInput, dim, atIndex, atValue, atReduce); } else { - at::scatter_out(atOut, atInput, dim, atIndex, atValue); + CALL_ATEN_CUDA_FUNC(scatter_out, atOut, atInput, dim, atIndex, atValue); } return diopiSuccess; } diopiError_t diopiUpsampleNearest(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiSize_t size) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Tensor atOut = impl::aten::buildATen(out); at::IntArrayRef atSize = impl::aten::buildAtIntArray(size); if (atInput.dim() == 3) { - at::upsample_nearest1d_out(atOut, atInput, atSize); + CALL_ATEN_CUDA_FUNC(upsample_nearest1d_out, atOut, atInput, atSize); } else if (atInput.dim() == 4) { - at::upsample_nearest2d_out(atOut, atInput, atSize); + CALL_ATEN_CUDA_FUNC(upsample_nearest2d_out, atOut, atInput, atSize); } else if (atInput.dim() == 5) { - at::upsample_nearest3d_out(atOut, atInput, atSize); + CALL_ATEN_CUDA_FUNC(upsample_nearest3d_out, atOut, atInput, atSize); } else { NOT_SUPPORTED("input dim < 3 or >5"); return diopiErrorOccurred; @@ -3766,17 +4004,18 @@ diopiError_t diopiUpsampleNearest(diopiContextHandle_t ctx, diopiTensorHandle_t diopiError_t diopiUpsampleNearestBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiConstTensorHandle_t grad_output, diopiSize_t out_size, diopiSize_t in_size) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atGradOut = impl::aten::buildATen(grad_output); at::Tensor atGradInput = impl::aten::buildATen(grad_input); at::IntArrayRef atOutSize = impl::aten::buildAtIntArray(out_size); at::IntArrayRef atInSize = impl::aten::buildAtIntArray(in_size); if (atGradInput.dim() == 3) { - at::upsample_nearest1d_backward_out(atGradInput, atGradOut, 
atOutSize, atInSize); + CALL_ATEN_CUDA_FUNC(upsample_nearest1d_backward_out, atGradInput, atGradOut, atOutSize, atInSize); } else if (atGradInput.dim() == 4) { - at::upsample_nearest2d_backward_out(atGradInput, atGradOut, atOutSize, atInSize); + CALL_ATEN_CUDA_FUNC(upsample_nearest2d_backward_out, atGradInput, atGradOut, atOutSize, atInSize); } else if (atGradInput.dim() == 5) { - at::upsample_nearest3d_backward_out(atGradInput, atGradOut, atOutSize, atInSize); + CALL_ATEN_CUDA_FUNC(upsample_nearest3d_backward_out, atGradInput, atGradOut, atOutSize, atInSize); } else { NOT_SUPPORTED("grad_input dim < 3 or >5"); return diopiErrorOccurred; @@ -3787,23 +4026,24 @@ diopiError_t diopiUpsampleNearestBackward(diopiContextHandle_t ctx, diopiTensorH diopiError_t diopiUpsampleLinear(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiSize_t size, bool align_corners, const char* mode) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Tensor atOut = impl::aten::buildATen(out); at::IntArrayRef atSize = impl::aten::buildAtIntArray(size); if (3 == atInput.dim() && 0 == strcmp(mode, "linear")) { - at::upsample_linear1d_out(atOut, atInput, atSize, align_corners); + CALL_ATEN_CUDA_FUNC(upsample_linear1d_out, atOut, atInput, atSize, align_corners); } else if (4 == atInput.dim()) { if (0 == strcmp(mode, "bilinear")) { - at::upsample_bilinear2d_out(atOut, atInput, atSize, align_corners); + CALL_ATEN_CUDA_FUNC(upsample_bilinear2d_out, atOut, atInput, atSize, align_corners); } else if (0 == strcmp(mode, "bicubic")) { - at::upsample_bicubic2d_out(atOut, atInput, atSize, align_corners); + CALL_ATEN_CUDA_FUNC(upsample_bicubic2d_out, atOut, atInput, atSize, align_corners); } else { NOT_SUPPORTED("interpolate mode type"); return diopiErrorOccurred; } } else if (5 == atInput.dim() && 0 == strcmp(mode, "trilinear")) { - at::upsample_trilinear3d_out(atOut, atInput, atSize, align_corners); + CALL_ATEN_CUDA_FUNC(upsample_trilinear3d_out, atOut, atInput, atSize, align_corners); } else { NOT_SUPPORTED("interpolate mode type"); return diopiErrorOccurred; @@ -3814,24 +4054,25 @@ diopiError_t diopiUpsampleLinear(diopiContextHandle_t ctx, diopiTensorHandle_t o diopiError_t diopiUpsampleLinearBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiConstTensorHandle_t grad_output, diopiSize_t out_size, diopiSize_t in_size, bool align_corners, const char* mode) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atGradOut = impl::aten::buildATen(grad_output); at::Tensor atGradInput = impl::aten::buildATen(grad_input); at::IntArrayRef atOutSize = impl::aten::buildAtIntArray(out_size); at::IntArrayRef atInSize = impl::aten::buildAtIntArray(in_size); if (3 == atGradInput.dim() && 0 == strcmp(mode, "linear")) { - at::upsample_linear1d_backward_out(atGradInput, atGradOut, atOutSize, atInSize, align_corners); + CALL_ATEN_CUDA_FUNC(upsample_linear1d_backward_out, atGradInput, atGradOut, atOutSize, atInSize, align_corners); } else if (4 == atGradInput.dim()) { if (0 == strcmp(mode, "bilinear")) { - at::upsample_bilinear2d_backward_out(atGradInput, atGradOut, atOutSize, atInSize, align_corners); + CALL_ATEN_CUDA_FUNC(upsample_bilinear2d_backward_out, atGradInput, atGradOut, atOutSize, atInSize, align_corners); } else if (0 == strcmp(mode, "bicubic")) { - at::upsample_bicubic2d_backward_out(atGradInput, atGradOut, atOutSize, atInSize, align_corners); + CALL_ATEN_CUDA_FUNC(upsample_bicubic2d_backward_out, atGradInput, 
atGradOut, atOutSize, atInSize, align_corners); } else { NOT_SUPPORTED("interpolate mode type"); return diopiErrorOccurred; } } else if (5 == atGradInput.dim() && 0 == strcmp(mode, "trilinear")) { - at::upsample_trilinear3d_backward_out(atGradInput, atGradOut, atOutSize, atInSize, align_corners); + CALL_ATEN_CUDA_FUNC(upsample_trilinear3d_backward_out, atGradInput, atGradOut, atOutSize, atInSize, align_corners); } else { NOT_SUPPORTED("interpolate mode type"); return diopiErrorOccurred; @@ -3843,7 +4084,7 @@ diopiError_t diopiUpsampleLinearBackward(diopiContextHandle_t ctx, diopiTensorHa diopiError_t diopiPad(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiSize_t pad, const char* mode, const double* value) { impl::aten::setCurStream(ctx); - auto atInput = impl::aten::buildATen(input); + auto atInput = impl::aten::buildATenSlow(input); auto atPad = impl::aten::buildAtIntArray(pad); torch::nn::functional::PadFuncOptions::mode_t pad_mode; double atValue = 0; @@ -3868,9 +4109,10 @@ diopiError_t diopiPad(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiCo diopiError_t diopiUnique(diopiContextHandle_t ctx, diopiTensorHandle_t* out, diopiConstTensorHandle_t input, const int64_t* dim, bool sorted, bool return_counts, diopiTensorHandle_t indices, diopiTensorHandle_t* counts) { + impl::aten::setCurStream(ctx); DIOPI_CHECK_PTR(out); - auto atInput = impl::aten::buildATen(input); + auto atInput = impl::aten::buildATenSlow(input); bool return_inverse = indices ? true : false; std::tuple atOuts; @@ -3893,8 +4135,8 @@ diopiError_t diopiUnique(diopiContextHandle_t ctx, diopiTensorHandle_t* out, dio diopiError_t diopiProd(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, const int64_t* dim) { impl::aten::setCurStream(ctx); - auto atInput = impl::aten::buildATen(input); - auto atOut = impl::aten::buildATen(out); + auto atInput = impl::aten::buildATenSlow(input); + auto atOut = impl::aten::buildATenSlow(out); if (dim == nullptr) { auto atTmp = at::prod(atInput); impl::aten::updateATen2Tensor(ctx, atTmp, out); @@ -3903,7 +4145,7 @@ diopiError_t diopiProd(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiC if (atInput.dim() == atOut.dim()) { keepdim = true; } - at::prod_out(atOut, atInput, *dim, keepdim); + CALL_ATEN_CUDA_FUNC(prod_out, atOut, atInput, *dim, keepdim); } return diopiSuccess; @@ -3911,14 +4153,15 @@ diopiError_t diopiProd(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiC diopiError_t diopiLinearBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiTensorHandle_t grad_weight, diopiTensorHandle_t grad_bias, diopiConstTensorHandle_t grad_output, diopiConstTensorHandle_t input, diopiConstTensorHandle_t weight) { + impl::aten::setCurStream(ctx); - auto atGradOutput = impl::aten::buildATen(grad_output); - auto atInput = impl::aten::buildATen(input); - auto atWeight = impl::aten::buildATen(weight); + auto atGradOutput = impl::aten::buildATenSlow(grad_output); + auto atInput = impl::aten::buildATenSlow(input); + auto atWeight = impl::aten::buildATenSlow(weight); if (grad_input) { - auto atGradInput = impl::aten::buildATen(grad_input); - at::matmul_out(atGradInput, atGradOutput, atWeight); + auto atGradInput = impl::aten::buildATenSlow(grad_input); + CALL_ATEN_FUNC(matmul_out, atGradInput, atGradOutput, atWeight); } int64_t dims = atInput.dim(); @@ -3930,8 +4173,8 @@ diopiError_t diopiLinearBackward(diopiContextHandle_t ctx, diopiTensorHandle_t g for (int i = 0; i < dims - 2; ++i) { 
sumDim.push_back(i); } - auto atGradWeight = impl::aten::buildATen(grad_weight); - at::sum_out(atGradWeight, atGradWeightTemp, sumDim); + auto atGradWeight = impl::aten::buildATenSlow(grad_weight); + CALL_ATEN_CUDA_FUNC(sum_out, atGradWeight, atGradWeightTemp, sumDim); } else { impl::aten::updateATen2Tensor(ctx, atGradWeightTemp, grad_weight); } @@ -3942,8 +4185,8 @@ diopiError_t diopiLinearBackward(diopiContextHandle_t ctx, diopiTensorHandle_t g for (int i = 0; i < dims - 1; ++i) { sumDim.push_back(i); } - auto atGradBias = impl::aten::buildATen(grad_bias); - at::sum_out(atGradBias, atGradOutput, sumDim); + auto atGradBias = impl::aten::buildATenSlow(grad_bias); + CALL_ATEN_CUDA_FUNC(sum_out, atGradBias, atGradOutput, sumDim); } return diopiSuccess; @@ -3953,9 +4196,9 @@ diopiError_t diopiCrossEntropyLossBackward(diopiContextHandle_t ctx, diopiTensor diopiConstTensorHandle_t input, diopiConstTensorHandle_t target, diopiConstTensorHandle_t weight, diopiReduction_t reduction, int64_t ignore_index, double label_smoothing) { impl::aten::setCurStream(ctx); - auto atGradOutput = impl::aten::buildATen(grad_output); - auto atInput = impl::aten::buildATen(input); - auto atTarget = impl::aten::buildATen(target); + auto atGradOutput = impl::aten::buildATenSlow(grad_output); + auto atInput = impl::aten::buildATenSlow(input); + auto atTarget = impl::aten::buildATenSlow(target); at::Tensor atGradInput; // case 1 @@ -3976,24 +4219,27 @@ diopiError_t diopiCrossEntropyLossBackward(diopiContextHandle_t ctx, diopiTensor } diopiError_t diopiErfinv(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Tensor atOut = impl::aten::buildATen(out); - at::erfinv_out(atOut, atInput); + CALL_ATEN_CUDA_FUNC(erfinv_out, atOut, atInput); return diopiSuccess; } diopiError_t diopiErfinvInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); - at::erfinv_out(atInput, atInput); + CALL_ATEN_CUDA_FUNC(erfinv_out, atInput, atInput); return diopiSuccess; } diopiError_t diopiIm2Col(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiSize_t kernel_size, diopiSize_t dilation, diopiSize_t padding, diopiSize_t stride) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Tensor atOut = impl::aten::buildATen(out); @@ -4002,13 +4248,14 @@ diopiError_t diopiIm2Col(diopiContextHandle_t ctx, diopiTensorHandle_t out, diop at::IntArrayRef atPadding = impl::aten::buildAtIntArray(padding); at::IntArrayRef atStride = impl::aten::buildAtIntArray(stride); - at::im2col_out(atOut, atInput, atKernelSize, atDilation, atPadding, atStride); + CALL_ATEN_CUDA_FUNC(im2col_out, atOut, atInput, atKernelSize, atDilation, atPadding, atStride); return diopiSuccess; } diopiError_t diopiCol2Im(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiSize_t output_size, diopiSize_t kernel_size, diopiSize_t dilation, diopiSize_t padding, diopiSize_t stride) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Tensor atOut = impl::aten::buildATen(out); @@ -4018,28 +4265,31 @@ diopiError_t diopiCol2Im(diopiContextHandle_t ctx, diopiTensorHandle_t out, diop at::IntArrayRef atPadding = impl::aten::buildAtIntArray(padding); at::IntArrayRef atStride = 
impl::aten::buildAtIntArray(stride); - at::col2im_out(atOut, atInput, atOutSize, atKernelSize, atDilation, atPadding, atStride); + CALL_ATEN_CUDA_FUNC(col2im_out, atOut, atInput, atOutSize, atKernelSize, atDilation, atPadding, atStride); return diopiSuccess; } diopiError_t diopiFlip(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiSize_t dims) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atOut = impl::aten::buildATen(out); at::Tensor atInput = impl::aten::buildATen(input); at::IntArrayRef atDims = impl::aten::buildAtIntArray(dims); - at::flip_out(atOut, atInput, atDims); + // not supported cuda dispatch yet, will supported in subsequent release. + CALL_ATEN_FUNC(flip_out, atOut, atInput, atDims); return diopiSuccess; } diopiError_t diopiCholesky(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiTensorHandle_t info, diopiConstTensorHandle_t mat, bool upper, bool checkerror) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atMat = impl::aten::buildATen(mat); at::Tensor atOut = impl::aten::buildATen(out); at::Tensor atInfo = impl::aten::buildATen(info); - at::linalg_cholesky_ex_out(atOut, atInfo, atMat, upper, checkerror); + CALL_ATEN_CUDA_FUNC(linalg_cholesky_ex_out, atOut, atInfo, atMat, upper, checkerror); return diopiSuccess; } @@ -4047,8 +4297,8 @@ diopiError_t diopiCholesky(diopiContextHandle_t ctx, diopiTensorHandle_t out, di diopiError_t diopiCholeskyBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_mat, diopiConstTensorHandle_t grad_output, diopiConstTensorHandle_t L, bool upper) { impl::aten::setCurStream(ctx); - at::Tensor atL = impl::aten::buildATen(L); - at::Tensor atGradOut = impl::aten::buildATen(grad_output); + at::Tensor atL = impl::aten::buildATenSlow(L); + at::Tensor atGradOut = impl::aten::buildATenSlow(grad_output); if (upper) { atL = atL.transpose(-1, -2).conj(); atGradOut = atGradOut.transpose(-1, -2).conj(); @@ -4066,12 +4316,13 @@ diopiError_t diopiCholeskyBackward(diopiContextHandle_t ctx, diopiTensorHandle_t diopiError_t diopiTriangularSolve(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiTensorHandle_t cloned_mat, diopiConstTensorHandle_t b, diopiConstTensorHandle_t mat, bool upper, bool transpose, bool unitriangular) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atClonedMat = impl::aten::buildATen(cloned_mat); at::Tensor atOut = impl::aten::buildATen(out); at::Tensor atb = impl::aten::buildATen(b); at::Tensor atMat = impl::aten::buildATen(mat); - at::triangular_solve_out(atOut, atClonedMat, atb, atMat, upper, transpose, unitriangular); + CALL_ATEN_CUDA_FUNC(triangular_solve_out, atOut, atClonedMat, atb, atMat, upper, transpose, unitriangular); return diopiSuccess; } @@ -4080,15 +4331,15 @@ DIOPI_API diopiError_t diopiTriangularSolveBackward(diopiContextHandle_t ctx, di diopiConstTensorHandle_t grad_x, diopiConstTensorHandle_t grad_cloned_mat, diopiConstTensorHandle_t x, diopiConstTensorHandle_t b, diopiConstTensorHandle_t mat, bool upper, bool transpose, bool unitriangular) { impl::aten::setCurStream(ctx); - at::Tensor atGradB = impl::aten::buildATen(grad_b); - at::Tensor atGradM = impl::aten::buildATen(grad_mat); + at::Tensor atGradB = impl::aten::buildATenSlow(grad_b); + at::Tensor atGradM = impl::aten::buildATenSlow(grad_mat); - at::Tensor atGradx = impl::aten::buildATen(grad_x); - at::Tensor atGradCloneMat = impl::aten::buildATen(grad_cloned_mat); + at::Tensor atGradx = impl::aten::buildATenSlow(grad_x); + at::Tensor atGradCloneMat = 
impl::aten::buildATenSlow(grad_cloned_mat); - at::Tensor atx = impl::aten::buildATen(x); - at::Tensor atb = impl::aten::buildATen(b); - at::Tensor atMat = impl::aten::buildATen(mat); + at::Tensor atx = impl::aten::buildATenSlow(x); + at::Tensor atb = impl::aten::buildATenSlow(b); + at::Tensor atMat = impl::aten::buildATenSlow(mat); at::Tensor atGradb, atGradMat; if (atGradx.defined() || atGradCloneMat.defined()) { @@ -4116,7 +4367,7 @@ DIOPI_API diopiError_t diopiTriangularSolveBackward(diopiContextHandle_t ctx, di std::vector newShape{nums, atGradMat.size(-2), -1}; if (nums != 1) { at::IntArrayRef atShape(newShape.data(), newShape.size()); - at::sum_out(atGradM, atGradMat.reshape(atShape), 0, false); + CALL_ATEN_CUDA_FUNC(sum_out, atGradM, atGradMat.reshape(atShape), 0, false); } else { impl::aten::updateATen2Tensor(ctx, atGradMat, grad_mat); } @@ -4125,7 +4376,7 @@ DIOPI_API diopiError_t diopiTriangularSolveBackward(diopiContextHandle_t ctx, di if (nums != 1) { newShape[0] = nums; at::IntArrayRef atShape(newShape.data(), newShape.size()); - at::sum_out(atGradB, atGradb.reshape(atShape), 0, false); + CALL_ATEN_CUDA_FUNC(sum_out, atGradB, atGradb.reshape(atShape), 0, false); } else { impl::aten::updateATen2Tensor(ctx, atGradb, grad_b); } @@ -4135,22 +4386,25 @@ DIOPI_API diopiError_t diopiTriangularSolveBackward(diopiContextHandle_t ctx, di } diopiError_t diopiRepeat(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiSize_t repeats_size) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); at::Tensor atOut = impl::aten::buildATen(out); at::IntArrayRef atRepeatsSize = impl::aten::buildAtIntArray(repeats_size); - at::repeat_out(atOut, atInput, atRepeatsSize); + // not supported cuda dispatch yet, will supported in subsequent release. 
+ CALL_ATEN_FUNC(repeat_out, atOut, atInput, atRepeatsSize); return diopiSuccess; } diopiError_t diopiMultinomial(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, int64_t num_samples, bool replacement, diopiGeneratorHandle_t generator) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); auto atOut = impl::aten::buildATen(out); at::Generator gen = impl::aten::buildGenerator(ctx, generator); - at::multinomial_out(atOut, atInput, num_samples, replacement, gen); + CALL_ATEN_CUDA_FUNC(multinomial_out, atOut, atInput, num_samples, replacement, gen); impl::aten::updateGeneratorHandleState(ctx, gen, generator); return diopiSuccess; @@ -4158,7 +4412,7 @@ diopiError_t diopiMultinomial(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiError_t diopiCastDtype(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { impl::aten::setCurStream(ctx); - auto atInput = impl::aten::buildATen(input); + auto atInput = impl::aten::buildATenSlow(input); diopiDtype_t dtype; diopiGetTensorDtype(out, &dtype); auto atOut = at::native::to(atInput, impl::aten::getATenType(dtype).toScalarType(), false, true, c10::nullopt); @@ -4168,138 +4422,152 @@ diopiError_t diopiCastDtype(diopiContextHandle_t ctx, diopiTensorHandle_t out, d } diopiError_t diopiPolar(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t abs, diopiConstTensorHandle_t angle) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atOut = impl::aten::buildATen(out); auto atAbs = impl::aten::buildATen(abs); auto atAngle = impl::aten::buildATen(angle); - at::polar_out(atOut, atAbs, atAngle); + CALL_ATEN_CUDA_FUNC(polar_out, atOut, atAbs, atAngle); return diopiSuccess; } DIOPI_API diopiError_t diopiCeilInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + at::Tensor atInput = impl::aten::buildATenSlow(input); at::ceil_(atInput); return diopiSuccess; } DIOPI_API diopiError_t diopiCeil(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atOut = impl::aten::buildATen(out); auto atInput = impl::aten::buildATen(input); - at::ceil_out(atOut, atInput); + CALL_ATEN_CUDA_FUNC(ceil_out, atOut, atInput); return diopiSuccess; } DIOPI_API diopiError_t diopiAsinInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + at::Tensor atInput = impl::aten::buildATenSlow(input); at::asin_(atInput); return diopiSuccess; } DIOPI_API diopiError_t diopiAsin(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atOut = impl::aten::buildATen(out); auto atInput = impl::aten::buildATen(input); - at::asin_out(atOut, atInput); + CALL_ATEN_CUDA_FUNC(asin_out, atOut, atInput); return diopiSuccess; } DIOPI_API diopiError_t diopiLerpTensor(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t end, diopiConstTensorHandle_t weight) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atOut = impl::aten::buildATen(out); auto atInput = impl::aten::buildATen(input); auto atEnd = impl::aten::buildATen(end); auto atWeight = impl::aten::buildATen(weight); - at::lerp_out(atOut, atInput, atEnd, atWeight); + CALL_ATEN_CUDA_FUNC(lerp_out, atOut, atInput, atEnd, 
atWeight); return diopiSuccess; } DIOPI_API diopiError_t diopiLerpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t end, const diopiScalar_t* weight) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atOut = impl::aten::buildATen(out); auto atInput = impl::aten::buildATen(input); auto atEnd = impl::aten::buildATen(end); at::Scalar atWeight = impl::aten::buildAtScalar(weight); - at::lerp_out(atOut, atInput, atEnd, atWeight); + CALL_ATEN_CUDA_FUNC(lerp_out, atOut, atInput, atEnd, atWeight); return diopiSuccess; } DIOPI_API diopiError_t diopiTriu(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, int64_t diagonal) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atOut = impl::aten::buildATen(out); auto atInput = impl::aten::buildATen(input); - at::triu_out(atOut, atInput, diagonal); + CALL_ATEN_CUDA_FUNC(triu_out, atOut, atInput, diagonal); return diopiSuccess; } DIOPI_API diopiError_t diopiTriuInp(diopiContextHandle_t ctx, diopiTensorHandle_t input, int64_t diagonal) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); - at::triu_out(atInput, atInput, diagonal); + CALL_ATEN_CUDA_FUNC(triu_out, atInput, atInput, diagonal); return diopiSuccess; } DIOPI_API diopiError_t diopiSgn(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atOut = impl::aten::buildATen(out); auto atInput = impl::aten::buildATen(input); - at::sgn_out(atOut, atInput); + CALL_ATEN_CUDA_FUNC(sgn_out, atOut, atInput); return diopiSuccess; } DIOPI_API diopiError_t diopiSgnInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::Tensor atInput = impl::aten::buildATen(input); - at::sgn_out(atInput, atInput); + CALL_ATEN_CUDA_FUNC(sgn_out, atInput, atInput); return diopiSuccess; } DIOPI_API diopiError_t diopiIsNan(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); auto atOut = impl::aten::buildATen(out); - at::isnan_out(atOut, atInput); + // cuda dispatch not supported yet, will be supported in a subsequent release.
+ CALL_ATEN_FUNC(isnan_out, atOut, atInput); return diopiSuccess; } DIOPI_API diopiError_t diopiLinalgQR(diopiContextHandle_t ctx, diopiConstTensorHandle_t A, const char* mode, diopiTensorHandle_t Q, diopiTensorHandle_t R) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atA = impl::aten::buildATen(A); auto atQ = impl::aten::buildATen(Q); auto atR = impl::aten::buildATen(R); c10::string_view atMode(mode, strlen(mode)); - at::linalg_qr_out(atQ, atR, atA, mode); + CALL_ATEN_CUDA_FUNC(linalg_qr_out, atQ, atR, atA, mode); return diopiSuccess; } diopiError_t diopiAmax(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t self, diopiSize_t dim, bool keepdim) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); at::IntArrayRef atDim = impl::aten::buildAtIntArray(dim); auto atOut = impl::aten::buildATen(out); auto atSelf = impl::aten::buildATen(self); - at::amax_out(atOut, atSelf, atDim, keepdim); + CALL_ATEN_CUDA_FUNC(amax_out, atOut, atSelf, atDim, keepdim); return diopiSuccess; } diopiError_t diopiBatchNormStats(diopiContextHandle_t ctx, diopiTensorHandle_t mean, diopiTensorHandle_t invstd, diopiConstTensorHandle_t input, double eps) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); auto atMean = impl::aten::buildATen(mean); @@ -4307,7 +4575,8 @@ diopiError_t diopiBatchNormStats(diopiContextHandle_t ctx, diopiTensorHandle_t m if (atInput.scalar_type() == at::kHalf) { DIOPI_CHECK(atMean.scalar_type() == at::kFloat && atInvstd.scalar_type() == at::kFloat, "out dtype should follow the accumulated dtype in CUDA."); } - at::batch_norm_stats_out(atMean, atInvstd, atInput, eps); + // not supported cuda dispatch yet, will supported in subsequent release. + CALL_ATEN_FUNC(batch_norm_stats_out, atMean, atInvstd, atInput, eps); return diopiSuccess; } @@ -4316,6 +4585,7 @@ DIOPI_API diopiError_t diopiBatchNormGatherStatsWithCounts(diopiContextHandle_t diopiConstTensorHandle_t input, diopiConstTensorHandle_t mean_all, diopiConstTensorHandle_t invstd_all, diopiTensorHandle_t running_mean, diopiTensorHandle_t running_var, float momentum, float eps, diopiConstTensorHandle_t counts) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); auto atMean_all = impl::aten::buildATen(mean_all); @@ -4325,7 +4595,8 @@ DIOPI_API diopiError_t diopiBatchNormGatherStatsWithCounts(diopiContextHandle_t auto atCounts = impl::aten::buildATen(counts); auto atMean = impl::aten::buildATen(mean); auto atInvstd = impl::aten::buildATen(invstd); - at::batch_norm_gather_stats_with_counts_out(atMean, atInvstd, atInput, atMean_all, atInvstd_all, atRunning_mean, atRunning_var, momentum, eps, atCounts); + // not supported cuda dispatch yet, will supported in subsequent release. 
+ CALL_ATEN_FUNC(batch_norm_gather_stats_with_counts_out, atMean, atInvstd, atInput, atMean_all, atInvstd_all, atRunning_mean, atRunning_var, momentum, eps, atCounts); return diopiSuccess; } @@ -4335,16 +4606,16 @@ DIOPI_API diopiError_t diopiBatchNormBackwardReduce(diopiContextHandle_t ctx, di diopiConstTensorHandle_t input, diopiConstTensorHandle_t mean, diopiConstTensorHandle_t invstd, diopiConstTensorHandle_t weight, bool input_g, bool weight_g, bool bias_g) { impl::aten::setCurStream(ctx); - auto atGradOut = impl::aten::buildATen(grad_out); - auto atInput = impl::aten::buildATen(input); - auto atMean = impl::aten::buildATen(mean); - auto atInvstd = impl::aten::buildATen(invstd); - auto atWeight = impl::aten::buildATen(weight); + auto atGradOut = impl::aten::buildATenSlow(grad_out); + auto atInput = impl::aten::buildATenSlow(input); + auto atMean = impl::aten::buildATenSlow(mean); + auto atInvstd = impl::aten::buildATenSlow(invstd); + auto atWeight = impl::aten::buildATenSlow(weight); if (sum_dy && sum_dy_xmu && grad_weight && grad_bias) { - auto atSumDy = impl::aten::buildATen(sum_dy); - auto atSumDyXmu = impl::aten::buildATen(sum_dy_xmu); - auto atGradWeight = impl::aten::buildATen(grad_weight); - auto atGradBias = impl::aten::buildATen(grad_bias); + auto atSumDy = impl::aten::buildATenSlow(sum_dy); + auto atSumDyXmu = impl::aten::buildATenSlow(sum_dy_xmu); + auto atGradWeight = impl::aten::buildATenSlow(grad_weight); + auto atGradBias = impl::aten::buildATenSlow(grad_bias); at::batch_norm_backward_reduce_out( atSumDy, atSumDyXmu, atGradWeight, atGradBias, atGradOut, atInput, atMean, atInvstd, atWeight, input_g, weight_g, bias_g); } else { @@ -4362,6 +4633,7 @@ DIOPI_API diopiError_t diopiBatchNormBackwardElemt(diopiContextHandle_t ctx, dio diopiConstTensorHandle_t input, diopiConstTensorHandle_t mean, diopiConstTensorHandle_t invstd, diopiConstTensorHandle_t weight, diopiConstTensorHandle_t sum_dy, diopiConstTensorHandle_t sum_dy_xmu, diopiConstTensorHandle_t count) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atGradOut = impl::aten::buildATen(grad_out); auto atInput = impl::aten::buildATen(input); @@ -4372,13 +4644,15 @@ DIOPI_API diopiError_t diopiBatchNormBackwardElemt(diopiContextHandle_t ctx, dio auto atSumDyXmu = impl::aten::buildATen(sum_dy_xmu); auto atCount = impl::aten::buildATen(count); auto atGradInput = impl::aten::buildATen(grad_input); - at::batch_norm_backward_elemt_out(atGradInput, atGradOut, atInput, atMean, atInvstd, atWeight, atSumDy, atSumDyXmu, atCount); + // not supported cuda dispatch yet, will supported in subsequent release. 
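For reference, each operator converted in the hunks above follows one shape: declare DIOPI_SCOPE_GUARD, bind the context stream, build at::Tensor views with buildATen (or buildATenSlow where a fully materialized tensor is needed), then dispatch through CALL_ATEN_CUDA_FUNC when a CUDA out-variant exists, falling back to CALL_ATEN_FUNC otherwise. A minimal sketch of that shape, reusing the ceil_out/isnan_out calls already shown above but under a hypothetical operator name, is given below; it is an illustration only, not part of the diff.

    // Sketch only: diopiExampleOp is a hypothetical name; the real operators above follow the same pattern.
    diopiError_t diopiExampleOp(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) {
        DIOPI_SCOPE_GUARD                      // RAII guard: bumps the thread-local nesting depth for this call
        impl::aten::setCurStream(ctx);         // make ATen work run on the context's stream
        at::Tensor atOut = impl::aten::buildATen(out);
        at::Tensor atInput = impl::aten::buildATen(input);
        // Ops with a CUDA out-variant go through the CUDA dispatch macro ...
        CALL_ATEN_CUDA_FUNC(ceil_out, atOut, atInput);
        // ... ops without one (e.g. isnan_out above) keep the composite at:: entry point:
        // CALL_ATEN_FUNC(isnan_out, atOut, atInput);
        return diopiSuccess;
    }
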
+ CALL_ATEN_FUNC(batch_norm_backward_elemt_out, atGradInput, atGradOut, atInput, atMean, atInvstd, atWeight, atSumDy, atSumDyXmu, atCount); return diopiSuccess; } DIOPI_API diopiError_t diopiBatchNormElemt(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t weight, diopiConstTensorHandle_t bias, diopiConstTensorHandle_t mean, diopiConstTensorHandle_t invstd, float eps) { + DIOPI_SCOPE_GUARD impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); auto atMean = impl::aten::buildATen(mean); @@ -4386,7 +4660,7 @@ DIOPI_API diopiError_t diopiBatchNormElemt(diopiContextHandle_t ctx, diopiTensor auto atWeight = impl::aten::buildATen(weight); auto atBias = impl::aten::buildATen(bias); auto atOut = impl::aten::buildATen(out); - at::batch_norm_elemt_out(atOut, atInput, atWeight, atBias, atMean, atInvstd, eps); + CALL_ATEN_CUDA_FUNC(batch_norm_elemt_out, atOut, atInput, atWeight, atBias, atMean, atInvstd, eps); return diopiSuccess; } diff --git a/impl/torch/helper.cpp b/impl/torch/helper.cpp index 2b180c5188..c377d949c2 100644 --- a/impl/torch/helper.cpp +++ b/impl/torch/helper.cpp @@ -7,6 +7,8 @@ #include +thread_local int diopiNestedScopeDepth=0; + namespace impl { namespace aten { @@ -108,6 +110,14 @@ class BuildATenDeviceImpl { template at::Tensor buildATenImpl(diopiConstTensorHandle_t tensor) { + DeviceImpl::lazyInitDevice(); + auto atTensorHandle = const_cast(reinterpret_cast(tensor)); + atTensorHandle->unsafeGetTensorImpl()->set_custom_device(true); + return *atTensorHandle; +} + +template +at::Tensor buildATenImplSlow(diopiConstTensorHandle_t tensor) { diopiSize_t shape; diopiGetTensorShape(tensor, &shape); at::IntArrayRef atSizes(shape.data, shape.len); @@ -172,6 +182,24 @@ at::Tensor buildATen(diopiConstTensorHandle_t tensor) { } } +at::Tensor buildATenSlow(diopiConstTensorHandle_t tensor) { + if (tensor == nullptr) { + return at::Tensor(); + } + + diopiDevice_t device; + diopiGetTensorDevice(tensor, &device); + switch (device) { + case diopi_host: + return buildATenImplSlow>(tensor); + case diopi_device: + return buildATenImplSlow>(tensor); + default: + TORCH_CHECK(false, "Invalid device type encountered in buildATen: ", device); + return {}; + } +} + at::Scalar buildAtScalar(const diopiScalar_t* scalar) { if (scalar == nullptr) { NOT_SUPPORTED("scalar is null ptr, we use temporarily zero"); diff --git a/impl/torch/helper.hpp b/impl/torch/helper.hpp index 8e9875ea70..36554249be 100644 --- a/impl/torch/helper.hpp +++ b/impl/torch/helper.hpp @@ -49,10 +49,27 @@ using diopi_tensor_list = std::vector; +extern "C" thread_local int diopiNestedScopeDepth; + namespace impl { namespace aten { +class DiopiScopeGuardImpl { +public: + DiopiScopeGuardImpl() { + diopiNestedScopeDepth++; + std::cout << "DiopiNestedDepth is " << diopiNestedScopeDepth << std::endl; + }; + + ~DiopiScopeGuardImpl() { + diopiNestedScopeDepth--; + std::cout << "DiopiNestedDepth is " << diopiNestedScopeDepth << std::endl; + }; +}; + +#define DIOPI_SCOPE_GUARD [[maybe_unused]] impl::aten::DiopiScopeGuardImpl diopi_scope_guard; + constexpr size_t MAX_GPU_NUMS = 16; inline void setCurStream(diopiContextHandle_t ctx) { @@ -97,6 +114,7 @@ inline c10::DeviceType getATenDevice(diopiDevice_t device) { } at::Tensor buildATen(diopiConstTensorHandle_t tensor); +at::Tensor buildATenSlow(diopiConstTensorHandle_t tensor); inline bool isInt(const diopiScalar_t* scalar) { return scalar->stype <= 7; } @@ -117,9 +135,18 @@ inline decltype(auto) buildATenList(T* tensors, 
int64_t numTensors) { return vecAtTensor; } +template +inline decltype(auto) buildATenListSlow(T* tensors, int64_t numTensors) { + std::vector vecAtTensor; + for (size_t i = 0; i < numTensors; ++i) { + vecAtTensor.emplace_back(buildATenSlow(tensors[i])); + } + return vecAtTensor; +} + inline void updateATen2Tensor(diopiContextHandle_t ctx, const at::Tensor& atOut, diopiTensorHandle_t out) { if (out != nullptr) { - at::Tensor atOutput = buildATen(out).reshape_as(atOut); + at::Tensor atOutput = buildATenSlow(out).reshape_as(atOut); // Set non_blocking=true to improve performance. // The data is not ready when this function returns. at::native::copy_(atOutput, atOut, true);
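
To round out the helper changes: buildATenSlow keeps the original handle-to-at::Tensor reconstruction path, and buildATenListSlow applies it across an array of handles. A hypothetical call site, with an invented operator name and assuming the helpers behave as declared above, would look roughly like the sketch below; it is not part of this patch.

    // Hypothetical illustration only; diopiExampleCat is not part of this patch.
    diopiError_t diopiExampleCat(diopiContextHandle_t ctx, diopiTensorHandle_t out,
                                 diopiConstTensorHandle_t* tensors, int64_t numTensors, int64_t dim) {
        DIOPI_SCOPE_GUARD                  // keeps diopiNestedScopeDepth > 0 while this op runs
        impl::aten::setCurStream(ctx);
        auto atOut = impl::aten::buildATenSlow(out);                          // fully materialized output tensor
        auto atTensors = impl::aten::buildATenListSlow(tensors, numTensors);  // vector of at::Tensor built via the slow path
        CALL_ATEN_FUNC(cat_out, atOut, atTensors, dim);                       // composite fallback entry point
        return diopiSuccess;
    }
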