Skip to content

Commit f60eec5

Browse files
authored
[VectorCombine] foldPermuteOfBinops - support multi-use binary ops and operands in shuffle folding (#173153)
Fixes #173033. This patch extends VectorCombine to fold binary operations through shuffles in scenarios involving multiple uses of both the binary operator and its operands. Previously, the transformation was restricted to single-use cases to prevent instruction duplication. This change implements a cost-based evaluation that allows the fold even when: (1) the binary operator has multiple users (requiring duplication of the arithmetic instruction); or (2) the operands of the binary operator (the shuffles) have multiple users (requiring the original shuffles to be preserved). The optimization is performed only if the TTI cost of the new instruction sequence, including any duplicated arithmetic and any preserved shuffles, is lower than the cost of the shuffle sequence it replaces. This is particularly beneficial on X86 targets for expensive cross-lane shuffles.
1 parent b785c99 commit f60eec5

File tree

2 files changed

+64
-19
lines changed

2 files changed

+64
-19
lines changed

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

Lines changed: 28 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -2280,8 +2280,7 @@ bool VectorCombine::foldConcatOfBoolMasks(Instruction &I) {
22802280
bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
22812281
BinaryOperator *BinOp;
22822282
ArrayRef<int> OuterMask;
2283-
if (!match(&I,
2284-
m_Shuffle(m_OneUse(m_BinOp(BinOp)), m_Undef(), m_Mask(OuterMask))))
2283+
if (!match(&I, m_Shuffle(m_BinOp(BinOp), m_Undef(), m_Mask(OuterMask))))
22852284
return false;
22862285

22872286
// Don't introduce poison into div/rem.
@@ -2290,12 +2289,10 @@ bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
22902289

22912290
Value *Op00, *Op01, *Op10, *Op11;
22922291
ArrayRef<int> Mask0, Mask1;
2293-
bool Match0 =
2294-
match(BinOp->getOperand(0),
2295-
m_OneUse(m_Shuffle(m_Value(Op00), m_Value(Op01), m_Mask(Mask0))));
2296-
bool Match1 =
2297-
match(BinOp->getOperand(1),
2298-
m_OneUse(m_Shuffle(m_Value(Op10), m_Value(Op11), m_Mask(Mask1))));
2292+
bool Match0 = match(BinOp->getOperand(0),
2293+
m_Shuffle(m_Value(Op00), m_Value(Op01), m_Mask(Mask0)));
2294+
bool Match1 = match(BinOp->getOperand(1),
2295+
m_Shuffle(m_Value(Op10), m_Value(Op11), m_Mask(Mask1)));
22992296
if (!Match0 && !Match1)
23002297
return false;
23012298

@@ -2340,22 +2337,35 @@ bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
23402337
all_of(NewMask1, [NumOpElts](int M) { return M < (int)NumOpElts; }) &&
23412338
ShuffleVectorInst::isIdentityMask(NewMask1, NumOpElts);
23422339

2340+
InstructionCost NewCost = 0;
23432341
// Try to merge shuffles across the binop if the new shuffles are not costly.
2342+
InstructionCost BinOpCost =
2343+
TTI.getArithmeticInstrCost(Opcode, BinOpTy, CostKind);
23442344
InstructionCost OldCost =
2345-
TTI.getArithmeticInstrCost(Opcode, BinOpTy, CostKind) +
2346-
TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, ShuffleDstTy,
2347-
BinOpTy, OuterMask, CostKind, 0, nullptr, {BinOp}, &I);
2348-
if (Match0)
2349-
OldCost += TTI.getShuffleCost(
2345+
BinOpCost + TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
2346+
ShuffleDstTy, BinOpTy, OuterMask, CostKind,
2347+
0, nullptr, {BinOp}, &I);
2348+
if (!BinOp->hasOneUse())
2349+
NewCost += BinOpCost;
2350+
2351+
if (Match0) {
2352+
InstructionCost Shuf0Cost = TTI.getShuffleCost(
23502353
TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, Op0Ty, Mask0, CostKind,
23512354
0, nullptr, {Op00, Op01}, cast<Instruction>(BinOp->getOperand(0)));
2352-
if (Match1)
2353-
OldCost += TTI.getShuffleCost(
2354-
TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, Op1Ty, Mask1, CostKind,
2355+
OldCost += Shuf0Cost;
2356+
if (!BinOp->hasOneUse() || !BinOp->getOperand(0)->hasOneUse())
2357+
NewCost += Shuf0Cost;
2358+
}
2359+
if (Match1) {
2360+
InstructionCost Shuf1Cost = TTI.getShuffleCost(
2361+
TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, Op1Ty, Mask1, CostKind,
23552362
0, nullptr, {Op10, Op11}, cast<Instruction>(BinOp->getOperand(1)));
2363+
OldCost += Shuf1Cost;
2364+
if (!BinOp->hasOneUse() || !BinOp->getOperand(1)->hasOneUse())
2365+
NewCost += Shuf1Cost;
2366+
}
23562367

2357-
InstructionCost NewCost =
2358-
TTI.getArithmeticInstrCost(Opcode, ShuffleDstTy, CostKind);
2368+
NewCost += TTI.getArithmeticInstrCost(Opcode, ShuffleDstTy, CostKind);
23592369

23602370
if (!IsIdentity0)
23612371
NewCost +=

llvm/test/Transforms/VectorCombine/X86/permute-of-binops.ll

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,6 @@ define <4 x float> @fadd_v4f32_mixed_types(<4 x float> %a0) {
6464
ret <4 x float> %post
6565
}
6666

67-
; Negative test - multiple use of fadd
6867
define <4 x double> @fadd_v4f64_multiuse_op(<4 x double> %a, <4 x double> %b) {
6968
; CHECK-LABEL: define <4 x double> @fadd_v4f64_multiuse_op(
7069
; CHECK-SAME: <4 x double> [[A:%.*]], <4 x double> [[B:%.*]]) #[[ATTR0]] {
@@ -101,6 +100,42 @@ define <4 x double> @fadd_v4f64_multiuse_shuffle(<4 x double> %a, <4 x double> %
101100
ret <4 x double> %post
102101
}
103102

103+
declare void @use_v32i8(<32 x i8>)
104+
define <32 x i8> @max_expense_multi_use_triggered(<32 x i8> %a, <32 x i8> %b) {
105+
; CHECK-LABEL: define <32 x i8> @max_expense_multi_use_triggered(
106+
; CHECK-SAME: <32 x i8> [[A:%.*]], <32 x i8> [[B:%.*]]) #[[ATTR0]] {
107+
; CHECK-NEXT: [[A1:%.*]] = shufflevector <32 x i8> [[A]], <32 x i8> poison, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
108+
; CHECK-NEXT: [[B1:%.*]] = shufflevector <32 x i8> [[B]], <32 x i8> poison, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
109+
; CHECK-NEXT: [[OP:%.*]] = add <32 x i8> [[A1]], [[B1]]
110+
; CHECK-NEXT: call void @use_v32i8(<32 x i8> [[OP]])
111+
; CHECK-NEXT: [[POST:%.*]] = add <32 x i8> [[A]], [[B]]
112+
; CHECK-NEXT: ret <32 x i8> [[POST]]
113+
;
114+
%a1 = shufflevector <32 x i8> %a, <32 x i8> poison, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
115+
%b1 = shufflevector <32 x i8> %b, <32 x i8> poison, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
116+
%op = add <32 x i8> %a1, %b1
117+
call void @use_v32i8(<32 x i8> %op)
118+
%post = shufflevector <32 x i8> %op, <32 x i8> poison, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
119+
ret <32 x i8> %post
120+
}
121+
122+
define <4 x double> @fadd_v4f64_multiuse_shuffle_triggers(<4 x double> %a, <4 x double> %b) {
123+
; CHECK-LABEL: define <4 x double> @fadd_v4f64_multiuse_shuffle_triggers(
124+
; CHECK-SAME: <4 x double> [[A:%.*]], <4 x double> [[B:%.*]]) #[[ATTR0]] {
125+
; CHECK-NEXT: [[A1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
126+
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
127+
; CHECK-NEXT: [[POST:%.*]] = fadd <4 x double> [[A]], [[TMP1]]
128+
; CHECK-NEXT: call void @use_v4f64(<4 x double> [[A1]])
129+
; CHECK-NEXT: ret <4 x double> [[POST]]
130+
;
131+
%a1 = shufflevector <4 x double> %a, <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
132+
%b1 = shufflevector <4 x double> %b, <4 x double> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
133+
%op = fadd <4 x double> %a1, %b1
134+
%post = shufflevector <4 x double> %op, <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
135+
call void @use_v4f64(<4 x double> %a1)
136+
ret <4 x double> %post
137+
}
138+
104139
define <4 x i32> @sdiv_v4i32(<4 x i32> %a, <4 x i32> %b) {
105140
; CHECK-LABEL: define <4 x i32> @sdiv_v4i32(
106141
; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR0]] {

0 commit comments

Comments
 (0)