Skip to content

Commit 20decf8

Browse files
committed
Merge branch 'feat/replace_opplugin_by_aclnn' of https://github.com/DeepLink-org/DIOPI into feat/replace_opplugin_by_aclnn
2 parents d7be303 + 532bb11 commit 20decf8

File tree

10 files changed

+115
-191
lines changed

10 files changed

+115
-191
lines changed

impl/ascend/aclnn/adaptor.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ inline aclTensor* createAclTensorFromDiopiTensor(diopiConstTensorHandle_t tensor
7676
if (tensor == nullptr) {
7777
return nullptr;
7878
}
79+
7980
diopiSize_t shape{};
8081
diopiGetTensorShape(tensor, &shape);
8182
diopiSize_t stride{};

impl/ascend/ascend_tensor.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,8 @@ aclFormat inferAclDataFormat(int64_t dim, const int64_t* shape, const int64_t* s
234234
return ACL_FORMAT_NHWC;
235235
}
236236
std::call_once(warningFlag, warnOnUnsupportedFormat, __FILE__, __LINE__, __FUNCTION__);
237+
} else if (dim == 3) {
238+
return ACL_FORMAT_NCL;
237239
}
238240
return ACL_FORMAT_ND;
239241
}

impl/ascend/device_configs.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -808,6 +808,11 @@
808808
rtol=5e-2,
809809
atol_half=5e-2,
810810
rtol_half=5e-2,
811+
para=dict(
812+
# for aclnnGroupNorm, eps must be greater than 0.
813+
# aclnnGroupNorm does not support float16 input
814+
eps=[Skip(-1), Skip(0)],
815+
),
811816
tensor_para=dict(
812817
args=[
813818
{

impl/ascend/functions/batch_norm.cpp

Lines changed: 28 additions & 126 deletions
Original file line numberDiff line numberDiff line change
@@ -4,146 +4,48 @@
44
* @copyright (c) 2023, DeepLink.
55
*/
66

7-
#include "../common/acloprunner.hpp"
7+
#include "../aclnn/acl_scalar.hpp"
8+
#include "../aclnn/adaptor.hpp"
89

910
namespace impl {
1011
namespace ascend {
1112

12-
void updateInputAscendTensorDim(AscendTensor& inputAt, bool training) {
13-
int64_t dim = inputAt.dim();
14-
if (2 == dim) {
15-
inputAt.unsqueeze(2);
16-
inputAt.unsqueeze(3);
17-
} else if (3 == dim) {
18-
inputAt.unsqueeze(3);
19-
} else if (5 == dim && !training) {
20-
std::vector<int64_t> shape4d{inputAt.shape(0), inputAt.shape(1), inputAt.shape(2), inputAt.shape(3) * inputAt.shape(4)};
21-
inputAt.view(shape4d);
22-
}
23-
}
24-
25-
void batchNormBackwardTrainingUpdate(diopiContextHandle_t ctx, diopiTensorHandle_t gradWeight, diopiTensorHandle_t gradBias, AscendTensor gradOutputAt,
26-
AscendTensor inputAt, diopiConstTensorHandle_t saveMean, diopiConstTensorHandle_t saveInvstd, double eps) {
27-
std::string name = (inputAt.dim() == 5) ? "BN3DTrainingUpdateGrad" : "BNTrainingUpdateGrad";
28-
AclOpRunner<4, 2>(name, ctx)
29-
.addInput(gradOutputAt)
30-
.addInput(inputAt)
31-
.addInput(saveMean)
32-
.addInput(saveInvstd)
33-
.addOutput(gradWeight)
34-
.addOutput(gradBias)
35-
.setAttr<float>("epsilon", static_cast<float>(eps))
36-
.run();
37-
}
38-
39-
void batchNormBackwardTrainingReduceNocheck(diopiContextHandle_t ctx, AscendTensor gradInputAt, diopiConstTensorHandle_t gradWeight,
40-
diopiConstTensorHandle_t gradBias, AscendTensor gradOutputAt, AscendTensor inputAt, diopiConstTensorHandle_t weight,
41-
diopiConstTensorHandle_t saveMean, diopiConstTensorHandle_t saveInvstd, double eps) {
42-
std::string name = (inputAt.dim() == 5) ? "BN3DTrainingReduceGrad" : "BNTrainingReduceGrad";
43-
AclOpRunner<7, 1>(name, ctx)
44-
.addInput(gradOutputAt)
45-
.addInput(inputAt)
46-
.addInput(gradWeight)
47-
.addInput(gradBias)
48-
.addInput(weight)
49-
.addInput(saveMean)
50-
.addInput(saveInvstd)
51-
.addOutput(gradInputAt)
52-
.setAttr<float>("epsilon", static_cast<float>(eps))
53-
.run();
54-
}
55-
5613
diopiError_t diopiBatchNorm(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiTensorHandle_t saveMean, diopiTensorHandle_t saveInvstd,
5714
diopiConstTensorHandle_t input, diopiConstTensorHandle_t weight, diopiConstTensorHandle_t bias, diopiTensorHandle_t runningMean,
5815
diopiTensorHandle_t runningVar, bool training, double momentum, double eps) {
59-
AscendTensor inputAt(input), outputAt(out);
60-
updateInputAscendTensorDim(inputAt, training);
61-
outputAt.view(inputAt.getAclMemShape());
62-
63-
std::vector<int64_t> batchShapeV{inputAt.shape(1)};
64-
diopiSize_t batchShapeSizeT{batchShapeV.data(), static_cast<int64_t>(batchShapeV.size())};
65-
diopiTensorHandle_t weightTemp = createTensorIfNullptrOrConstCast(ctx, weight, batchShapeSizeT, inputAt.dtype(), true, 1);
66-
diopiTensorHandle_t biasTemp = createTensorIfNullptrOrConstCast(ctx, bias, batchShapeSizeT, inputAt.dtype(), true, 0);
67-
diopiTensorHandle_t runningMeanTemp = createTensorIfNullptrOrConstCast(ctx, runningMean, batchShapeSizeT, inputAt.dtype(), true, 0);
68-
diopiTensorHandle_t runningVarTemp = createTensorIfNullptrOrConstCast(ctx, runningVar, batchShapeSizeT, inputAt.dtype(), true, 1);
69-
70-
if (!training) {
71-
AclOpRunner<5, 1>("BNInfer", ctx)
72-
.addInput(inputAt)
73-
.addInput(weightTemp)
74-
.addInput(biasTemp)
75-
.addInput(runningMeanTemp)
76-
.addInput(runningVarTemp)
77-
.addOutput(outputAt)
78-
.setAttr("epsilon", static_cast<float>(eps))
79-
.run();
80-
81-
diopiTensorHandle_t runningVarBroadcasted;
82-
makeTensorLike(ctx, &runningVarBroadcasted, input);
83-
AscendTensor runningVarAt(runningVar);
84-
runningVarAt.unsqueeze(0);
85-
runningVarAt.unsqueeze(2);
86-
runningVarAt.unsqueeze(3);
87-
AclOpRunner<2, 1>("BroadcastTo", ctx).addInput(runningVarAt).addConstInput(inputAt.shape()).addOutput(runningVarBroadcasted).run();
88-
} else {
89-
diopiTensorHandle_t sum = nullptr, squareSum = nullptr;
90-
diopiSize_t shape, stride;
91-
diopiGetTensorShape(runningMeanTemp, &shape);
92-
diopiGetTensorStride(runningMeanTemp, &stride);
93-
diopiRequireTensor(ctx, &sum, &shape, &stride, diopiDtype_t::diopi_dtype_float32, diopi_device);
94-
diopiRequireTensor(ctx, &squareSum, &shape, &stride, diopiDtype_t::diopi_dtype_float32, diopi_device);
95-
AclOpRunner<1, 2>("BNTrainingReduce", ctx).addInput(inputAt).addOutput(sum).setAttr("epsilon", static_cast<float>(eps)).addOutput(squareSum).run();
96-
AclOpRunner<7, 5>("BNTrainingUpdate", ctx)
97-
.addInput(inputAt)
98-
.addInput(sum)
99-
.addInput(squareSum)
100-
.addInput(weightTemp)
101-
.addInput(biasTemp)
102-
.addInput(runningMeanTemp)
103-
.addInput(runningVarTemp)
104-
.setAttr("epsilon", static_cast<float>(eps))
105-
.setAttr("factor", static_cast<float>(momentum))
106-
.addOutput(outputAt)
107-
.addOutput(runningMeanTemp)
108-
.addOutput(runningVarTemp)
109-
.addOutput(saveMean)
110-
.addOutput(saveInvstd)
111-
.run();
112-
}
16+
DIOPI_ASCEND_CALL_ACLNN(aclnnBatchNorm, ctx, input, weight, bias, runningMean, runningVar, training, momentum, eps, out, saveMean, saveInvstd);
11317
return diopiSuccess;
11418
}
11519

11620
diopiError_t diopiBatchNormBackward(diopiContextHandle_t ctx, diopiTensorHandle_t gradInput, diopiTensorHandle_t gradWeight, diopiTensorHandle_t gradBias,
11721
diopiConstTensorHandle_t gradOutput, diopiConstTensorHandle_t input, diopiConstTensorHandle_t weight,
118-
diopiConstTensorHandle_t runninMean, diopiConstTensorHandle_t runningVar, diopiConstTensorHandle_t saveMean,
22+
diopiConstTensorHandle_t runningMean, diopiConstTensorHandle_t runningVar, diopiConstTensorHandle_t saveMean,
11923
diopiConstTensorHandle_t saveInvstd, bool training, double eps) {
120-
AscendTensor inputAt(input), gradOutputAt(gradOutput), gradInputAt(gradInput);
121-
updateInputAscendTensorDim(inputAt, training);
122-
gradOutputAt.view(inputAt.getAclMemShape());
123-
gradInputAt.view(inputAt.getAclMemShape());
124-
125-
if (!training) {
126-
batchNormBackwardTrainingUpdate(ctx, gradWeight, gradBias, gradOutputAt, inputAt, runninMean, runningVar, eps);
127-
128-
AclOpRunner<3, 1>("BNInferGrad", ctx)
129-
.addInput(gradOutputAt)
130-
.addInput(weight)
131-
.addInput(runningVar)
132-
.addOutput(gradInputAt)
133-
.setAttr<float>("epsilon", static_cast<float>(eps))
134-
.run();
135-
136-
diopiTensorHandle_t runningVarBroadcasted;
137-
makeTensorLike(ctx, &runningVarBroadcasted, input);
138-
AscendTensor runningVarAt(runningVar);
139-
runningVarAt.unsqueeze(0);
140-
runningVarAt.unsqueeze(2);
141-
runningVarAt.unsqueeze(3);
142-
AclOpRunner<2, 1>("BroadcastTo", ctx).addInput(runningVarAt).addConstInput(inputAt.shape()).addOutput(runningVarBroadcasted).run();
143-
} else {
144-
batchNormBackwardTrainingUpdate(ctx, gradWeight, gradBias, gradOutputAt, inputAt, saveMean, saveInvstd, eps);
145-
batchNormBackwardTrainingReduceNocheck(ctx, gradInputAt, gradWeight, gradBias, gradOutputAt, inputAt, weight, saveMean, saveInvstd, eps);
24+
std::array<bool, 3> gradMask = {true, true, true};
25+
if (nullptr == gradInput) {
26+
gradMask[0] = false;
27+
}
28+
if (nullptr == gradWeight) {
29+
gradMask[1] = false;
30+
}
31+
if (nullptr == gradBias) {
32+
gradMask[2] = false;
14633
}
34+
DIOPI_ASCEND_CALL_ACLNN(aclnnBatchNormBackward,
35+
ctx,
36+
gradOutput,
37+
input,
38+
weight,
39+
runningMean,
40+
runningVar,
41+
saveMean,
42+
saveInvstd,
43+
training,
44+
eps,
45+
gradMask,
46+
gradInput,
47+
gradWeight,
48+
gradBias);
14749
return diopiSuccess;
14850
}
14951

impl/ascend/functions/dropout.cpp

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,9 @@ diopiError_t npuDropoutOut(diopiContextHandle_t ctx, diopiTensorHandle_t out, di
2323
diopiError_t ret = diopiRequireTensor(ctx, &maskNpu, &maskSize, nullptr, diopi_dtype_uint8, diopi_device);
2424
ASCEND_CHECK_ABORT(ret == diopiSuccess, "[npuDropoutOut] require tensor for mask failed.");
2525

26-
uint64_t seed, offset;
27-
DIOPI_CALL(diopiGeneratorGetSeedAndOffset(generator, &seed, &offset));
26+
const std::pair<uint64_t, uint64_t> gen = getSeedAndOffset(ctx, generator, 10);
27+
const uint64_t seed = gen.first;
28+
const uint64_t offset = gen.second;
2829

2930
DIOPI_ASCEND_CALL_ACLNN(aclnnDropoutGenMask, ctx, inAt.shape(), p, seed, offset, maskNpu);
3031
DIOPI_ASCEND_CALL_ACLNN(aclnnDropoutDoMask, ctx, input, maskNpu, p, out);
@@ -57,8 +58,9 @@ diopiError_t npuDropout2dOut(diopiContextHandle_t ctx, diopiTensorHandle_t out,
5758
diopiError_t ret = diopiRequireTensor(ctx, &maskNpu, &maskNpuSize, nullptr, diopi_dtype_uint8, diopi_device);
5859
ASCEND_CHECK_ABORT(ret == diopiSuccess, "[npuDropout2dOut] require tensor for mask failed.");
5960

60-
uint64_t seed, offset;
61-
DIOPI_CALL(diopiGeneratorGetSeedAndOffset(generator, &seed, &offset));
61+
const std::pair<uint64_t, uint64_t> gen = getSeedAndOffset(ctx, generator, 10);
62+
const uint64_t seed = gen.first;
63+
const uint64_t offset = gen.second;
6264

6365
DIOPI_ASCEND_CALL_ACLNN(aclnnDropoutGenMask, ctx, inAt.shape(), p, seed, offset, maskNpu);
6466
DIOPI_ASCEND_CALL_ACLNN(aclnnDropoutDoMask, ctx, input2d, maskNpu, p, out2d);

impl/ascend/functions/group_norm.cpp

Lines changed: 42 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -4,31 +4,58 @@
44
* @copyright (c) 2023, DeepLink.
55
*/
66

7-
#include "../common/acloprunner.hpp"
7+
#include <cmath>
8+
9+
#include "../aclnn/acl_scalar.hpp"
10+
#include "../aclnn/adaptor.hpp"
11+
#include "../common/utils.hpp"
812

913
namespace impl {
1014
namespace ascend {
1115

1216
DIOPI_API diopiError_t diopiGroupNorm(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiTensorHandle_t saveMean, diopiTensorHandle_t saveInvstd,
1317
diopiConstTensorHandle_t input, diopiConstTensorHandle_t weight, diopiConstTensorHandle_t bias, int64_t numGroups,
1418
double eps) {
15-
if (0 == AscendTensor(input).numel()) {
16-
AclOpRunner<1, 1>("Fills", ctx).addInput(out).setAttr<float>("value", 0).addOutput(out).run();
19+
AscendTensor inputAt(input);
20+
if (!inputAt.defined() || inputAt.numel() == 0) {
1721
return diopiSuccess;
1822
}
1923

20-
AclOpRunner<3, 3>("GroupNorm", ctx)
21-
.addInput(input)
22-
.addInput(weight)
23-
.addInput(bias)
24-
.setAttr("num_groups", static_cast<int32_t>(numGroups))
25-
.setAttr("epsilon", static_cast<float>(eps))
26-
.setAttr("data_format", std::string{getAclDataFormat(input) == ACL_FORMAT_ND ? "ND" : "NCHW"})
27-
.setAttr("is_training", true)
28-
.addOutput(out)
29-
.addOutput(saveMean)
30-
.addOutput(saveInvstd)
31-
.run();
24+
int64_t n = inputAt.shape(0);
25+
int64_t c = inputAt.shape(1);
26+
int64_t hw = inputAt.numel() / (n * c);
27+
28+
DIOPI_ASCEND_CALL_ACLNN(aclnnGroupNorm, ctx, input, weight, bias, n, c, hw, numGroups, eps, out, saveMean, saveInvstd);
29+
return diopiSuccess;
30+
}
31+
32+
diopiError_t diopiGroupNormBackward(diopiContextHandle_t ctx, diopiTensorHandle_t gradInput, diopiTensorHandle_t gradWeight, diopiTensorHandle_t gradBias,
33+
diopiConstTensorHandle_t gradOutput, diopiConstTensorHandle_t input, diopiConstTensorHandle_t weight,
34+
diopiConstTensorHandle_t mean, diopiConstTensorHandle_t rstd, int64_t numGroups) {
35+
AscendTensor inputAt(input);
36+
AscendTensor gradWeightAt(gradWeight);
37+
38+
if (!inputAt.defined()) {
39+
return diopiSuccess;
40+
}
41+
42+
if (inputAt.numel() == 0) {
43+
DIOPI_ASCEND_CALL_ACLNN(aclnnInplaceZero, ctx, gradBias);
44+
if (inputAt.shape(0) == 0 || inputAt.shape(1) == 0) {
45+
DIOPI_ASCEND_CALL_ACLNN(aclnnInplaceZero, ctx, gradWeight);
46+
} else {
47+
diopiScalar_t nanScalar = constructDiopiScalarT(gradWeightAt.dtype(), std::nanf(""));
48+
DIOPI_ASCEND_CALL_ACLNN(aclnnInplaceFillScalar, ctx, gradWeightAt, &nanScalar);
49+
}
50+
} else {
51+
int64_t n = inputAt.shape(0);
52+
int64_t c = inputAt.shape(1);
53+
int64_t hw = inputAt.numel() / (n * c);
54+
55+
std::array<bool, 3> gradMask = {gradInput != nullptr, gradWeight != nullptr, gradBias != nullptr};
56+
DIOPI_ASCEND_CALL_ACLNN(
57+
aclnnGroupNormBackward, ctx, gradOutput, inputAt, mean, rstd, weight, n, c, hw, numGroups, gradMask, gradInput, gradWeightAt, gradBias);
58+
}
3259
return diopiSuccess;
3360
}
3461

impl/ascend/functions/normal.cpp

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,9 @@ diopiError_t diopiNormal(diopiContextHandle_t ctx, diopiTensorHandle_t out, doub
1616
return diopiSuccess;
1717
}
1818

19-
uint64_t seed, offset;
20-
DIOPI_CALL(diopiGeneratorGetSeedAndOffset(generator, &seed, &offset));
19+
const std::pair<uint64_t, uint64_t> gen = getSeedAndOffset(ctx, generator, 10);
20+
const uint64_t seed = gen.first;
21+
const uint64_t offset = gen.second;
2122

2223
float meanCast = static_cast<float>(mean);
2324
float rstdCast = static_cast<float>(std);
@@ -26,8 +27,9 @@ diopiError_t diopiNormal(diopiContextHandle_t ctx, diopiTensorHandle_t out, doub
2627
}
2728

2829
diopiError_t diopiNormalInp(diopiContextHandle_t ctx, diopiTensorHandle_t inout, double mean, double std, diopiGeneratorHandle_t generator) {
29-
uint64_t seed, offset;
30-
DIOPI_CALL(diopiGeneratorGetSeedAndOffset(generator, &seed, &offset));
30+
const std::pair<uint64_t, uint64_t> gen = getSeedAndOffset(ctx, generator, 10);
31+
const uint64_t seed = gen.first;
32+
const uint64_t offset = gen.second;
3133

3234
float meanCast = static_cast<float>(mean);
3335
float rstdCast = static_cast<float>(std);
@@ -42,8 +44,9 @@ diopiError_t diopiNormalTensor(diopiContextHandle_t ctx, diopiTensorHandle_t out
4244
return diopiSuccess;
4345
}
4446

45-
uint64_t seed, offset;
46-
DIOPI_CALL(diopiGeneratorGetSeedAndOffset(generator, &seed, &offset));
47+
const std::pair<uint64_t, uint64_t> gen = getSeedAndOffset(ctx, generator, 10);
48+
const uint64_t seed = gen.first;
49+
const uint64_t offset = gen.second;
4750

4851
DIOPI_ASCEND_CALL_ACLNN(aclnnNormalTensorTensor, ctx, mean, std, seed, offset, out);
4952
return diopiSuccess;
@@ -56,8 +59,9 @@ diopiError_t diopiNormalScalarTensor(diopiContextHandle_t ctx, diopiTensorHandle
5659
return diopiSuccess;
5760
}
5861

59-
uint64_t seed, offset;
60-
DIOPI_CALL(diopiGeneratorGetSeedAndOffset(generator, &seed, &offset));
62+
const std::pair<uint64_t, uint64_t> gen = getSeedAndOffset(ctx, generator, 10);
63+
const uint64_t seed = gen.first;
64+
const uint64_t offset = gen.second;
6165

6266
float meanCast = static_cast<float>(mean);
6367
DIOPI_ASCEND_CALL_ACLNN(aclnnNormalFloatTensor, ctx, meanCast, std, seed, offset, out);
@@ -71,8 +75,9 @@ diopiError_t diopiNormalTensorScalar(diopiContextHandle_t ctx, diopiTensorHandle
7175
return diopiSuccess;
7276
}
7377

74-
uint64_t seed, offset;
75-
DIOPI_CALL(diopiGeneratorGetSeedAndOffset(generator, &seed, &offset));
78+
const std::pair<uint64_t, uint64_t> gen = getSeedAndOffset(ctx, generator, 10);
79+
const uint64_t seed = gen.first;
80+
const uint64_t offset = gen.second;
7681

7782
float rstdCast = static_cast<float>(std);
7883
DIOPI_ASCEND_CALL_ACLNN(aclnnNormalTensorFloat, ctx, mean, rstdCast, seed, offset, out);

0 commit comments

Comments
 (0)