Skip to content

Commit 7ee408d

Browse files
committed
[Testing Only] Only fold flat offsets if they are inbounds PTRADDs
Squash of the following upstream PRs for downstream testing: - llvm#165424 - llvm#165425 - llvm#165426 - llvm#165427 Regenerated outputs for the following tests to resolve merge conflicts: - llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll - llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll For SWDEV-516125.
1 parent 27cda2d commit 7ee408d

33 files changed

+7483
-6516
lines changed

llvm/include/llvm/CodeGen/SelectionDAG.h

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1113,7 +1113,8 @@ class SelectionDAG {
11131113
SDValue Mask, SDValue EVL);
11141114

11151115
/// Returns sum of the base pointer and offset.
1116-
/// Unlike getObjectPtrOffset this does not set NoUnsignedWrap by default.
1116+
/// Unlike getObjectPtrOffset this does not set NoUnsignedWrap and InBounds by
1117+
/// default.
11171118
LLVM_ABI SDValue
11181119
getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL,
11191120
const SDNodeFlags Flags = SDNodeFlags());
@@ -1123,15 +1124,18 @@ class SelectionDAG {
11231124

11241125
/// Create an add instruction with appropriate flags when used for
11251126
/// addressing some offset of an object. i.e. if a load is split into multiple
1126-
/// components, create an add nuw from the base pointer to the offset.
1127+
/// components, create an add nuw (or ptradd nuw inbounds) from the base
1128+
/// pointer to the offset.
11271129
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset) {
1128-
return getMemBasePlusOffset(Ptr, Offset, SL, SDNodeFlags::NoUnsignedWrap);
1130+
return getMemBasePlusOffset(
1131+
Ptr, Offset, SL, SDNodeFlags::NoUnsignedWrap | SDNodeFlags::InBounds);
11291132
}
11301133

11311134
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, SDValue Offset) {
11321135
// The object itself can't wrap around the address space, so it shouldn't be
11331136
// possible for the adds of the offsets to the split parts to overflow.
1134-
return getMemBasePlusOffset(Ptr, Offset, SL, SDNodeFlags::NoUnsignedWrap);
1137+
return getMemBasePlusOffset(
1138+
Ptr, Offset, SL, SDNodeFlags::NoUnsignedWrap | SDNodeFlags::InBounds);
11351139
}
11361140

11371141
/// Return a new CALLSEQ_START node, that starts new call frame, in which

llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 24 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5649,17 +5649,35 @@ class LLVM_ABI TargetLowering : public TargetLoweringBase {
56495649
/// Get a pointer to vector element \p Idx located in memory for a vector of
56505650
/// type \p VecVT starting at a base address of \p VecPtr. If \p Idx is out of
56515651
/// bounds the returned pointer is unspecified, but will be within the vector
5652-
/// bounds.
5653-
SDValue getVectorElementPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT,
5654-
SDValue Index) const;
5652+
/// bounds. \p PtrArithFlags can be used to mark that arithmetic within the
5653+
/// vector in memory is known to not wrap or to be inbounds.
5654+
SDValue getVectorElementPointer(
5655+
SelectionDAG &DAG, SDValue VecPtr, EVT VecVT, SDValue Index,
5656+
const SDNodeFlags PtrArithFlags = SDNodeFlags()) const;
5657+
5658+
/// Get a pointer to vector element \p Idx located in memory for a vector of
5659+
/// type \p VecVT starting at a base address of \p VecPtr. If \p Idx is out of
5660+
/// bounds the returned pointer is unspecified, but will be within the vector
5661+
/// bounds. \p VecPtr is guaranteed to point to the beginning of a memory
5662+
/// location large enough for the vector.
5663+
SDValue getInboundsVectorElementPointer(SelectionDAG &DAG, SDValue VecPtr,
5664+
EVT VecVT, SDValue Index) const {
5665+
return getVectorElementPointer(DAG, VecPtr, VecVT, Index,
5666+
SDNodeFlags::NoUnsignedWrap |
5667+
SDNodeFlags::InBounds);
5668+
}
56555669

56565670
/// Get a pointer to a sub-vector of type \p SubVecVT at index \p Idx located
56575671
/// in memory for a vector of type \p VecVT starting at a base address of
56585672
/// \p VecPtr. If \p Idx plus the size of \p SubVecVT is out of bounds the
56595673
/// returned pointer is unspecified, but the value returned will be such that
5660-
/// the entire subvector would be within the vector bounds.
5661-
SDValue getVectorSubVecPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT,
5662-
EVT SubVecVT, SDValue Index) const;
5674+
/// the entire subvector would be within the vector bounds. \p PtrArithFlags
5675+
/// can be used to mark that arithmetic within the vector in memory is known
5676+
/// to not wrap or to be inbounds.
5677+
SDValue
5678+
getVectorSubVecPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT,
5679+
EVT SubVecVT, SDValue Index,
5680+
const SDNodeFlags PtrArithFlags = SDNodeFlags()) const;
56635681

56645682
/// Method for building the DAG expansion of ISD::[US][MIN|MAX]. This
56655683
/// method accepts integers as its arguments.

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2715,6 +2715,12 @@ SDValue DAGCombiner::visitPTRADD(SDNode *N) {
27152715
(N->getFlags() & N0->getFlags()) & SDNodeFlags::NoUnsignedWrap;
27162716
SDValue Add = DAG.getNode(ISD::ADD, DL, IntVT, {Y, Z}, Flags);
27172717
AddToWorklist(Add.getNode());
2718+
// We can't set InBounds even if both original ptradds were InBounds and
2719+
// NUW: SDAG usually represents pointers as integers, therefore, the
2720+
// matched pattern behaves as if it had implicit casts:
2721+
// (ptradd inbounds (inttoptr (ptrtoint (ptradd inbounds x, y))), z)
2722+
// The outer inbounds ptradd might therefore rely on a provenance that x
2723+
// does not have.
27182724
return DAG.getMemBasePlusOffset(X, Add, DL, Flags);
27192725
}
27202726
}
@@ -2740,6 +2746,12 @@ SDValue DAGCombiner::visitPTRADD(SDNode *N) {
27402746
// that.
27412747
SDNodeFlags Flags =
27422748
(N->getFlags() & N0->getFlags()) & SDNodeFlags::NoUnsignedWrap;
2749+
// We can't set InBounds even if both original ptradds were InBounds and
2750+
// NUW: SDAG usually represents pointers as integers, therefore, the
2751+
// matched pattern behaves as if it had implicit casts:
2752+
// (ptradd inbounds (inttoptr (ptrtoint (ptradd inbounds GA, v))), c)
2753+
// The outer inbounds ptradd might therefore rely on a provenance that
2754+
// GA does not have.
27432755
SDValue Inner = DAG.getMemBasePlusOffset(GAValue, N1, DL, Flags);
27442756
AddToWorklist(Inner.getNode());
27452757
return DAG.getMemBasePlusOffset(Inner, N0.getOperand(1), DL, Flags);
@@ -2763,8 +2775,13 @@ SDValue DAGCombiner::visitPTRADD(SDNode *N) {
27632775
bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
27642776

27652777
// If both additions in the original were NUW, reassociation preserves that.
2766-
SDNodeFlags ReassocFlags =
2767-
(N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;
2778+
SDNodeFlags CommonFlags = N->getFlags() & N1->getFlags();
2779+
SDNodeFlags ReassocFlags = CommonFlags & SDNodeFlags::NoUnsignedWrap;
2780+
if (CommonFlags.hasNoUnsignedWrap()) {
2781+
// If both operations are NUW and the PTRADD is inbounds, the offsets are
2782+
// both non-negative, so the reassociated PTRADDs are also inbounds.
2783+
ReassocFlags |= N->getFlags() & SDNodeFlags::InBounds;
2784+
}
27682785

27692786
if (ZIsConstant != YIsConstant) {
27702787
if (YIsConstant)
@@ -22745,7 +22762,10 @@ SDValue DAGCombiner::replaceStoreOfInsertLoad(StoreSDNode *ST) {
2274522762
NewPtr = DAG.getMemBasePlusOffset(Ptr, TypeSize::getFixed(COffset), DL);
2274622763
PointerInfo = ST->getPointerInfo().getWithOffset(COffset);
2274722764
} else {
22748-
NewPtr = TLI.getVectorElementPointer(DAG, Ptr, Value.getValueType(), Idx);
22765+
// The original DAG loaded the entire vector from memory, so arithmetic
22766+
// within it must be inbounds.
22767+
NewPtr = TLI.getInboundsVectorElementPointer(DAG, Ptr, Value.getValueType(),
22768+
Idx);
2274922769
}
2275022770

2275122771
return DAG.getStore(Chain, DL, Elt, NewPtr, PointerInfo, ST->getAlign(),

llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -10668,19 +10668,20 @@ static SDValue clampDynamicVectorIndex(SelectionDAG &DAG, SDValue Idx,
1066810668
DAG.getConstant(MaxIndex, dl, IdxVT));
1066910669
}
1067010670

10671-
SDValue TargetLowering::getVectorElementPointer(SelectionDAG &DAG,
10672-
SDValue VecPtr, EVT VecVT,
10673-
SDValue Index) const {
10671+
SDValue
10672+
TargetLowering::getVectorElementPointer(SelectionDAG &DAG, SDValue VecPtr,
10673+
EVT VecVT, SDValue Index,
10674+
const SDNodeFlags PtrArithFlags) const {
1067410675
return getVectorSubVecPointer(
1067510676
DAG, VecPtr, VecVT,
1067610677
EVT::getVectorVT(*DAG.getContext(), VecVT.getVectorElementType(), 1),
10677-
Index);
10678+
Index, PtrArithFlags);
1067810679
}
1067910680

10680-
SDValue TargetLowering::getVectorSubVecPointer(SelectionDAG &DAG,
10681-
SDValue VecPtr, EVT VecVT,
10682-
EVT SubVecVT,
10683-
SDValue Index) const {
10681+
SDValue
10682+
TargetLowering::getVectorSubVecPointer(SelectionDAG &DAG, SDValue VecPtr,
10683+
EVT VecVT, EVT SubVecVT, SDValue Index,
10684+
const SDNodeFlags PtrArithFlags) const {
1068410685
SDLoc dl(Index);
1068510686
// Make sure the index type is big enough to compute in.
1068610687
Index = DAG.getZExtOrTrunc(Index, dl, VecPtr.getValueType());
@@ -10704,7 +10705,7 @@ SDValue TargetLowering::getVectorSubVecPointer(SelectionDAG &DAG,
1070410705

1070510706
Index = DAG.getNode(ISD::MUL, dl, IdxVT, Index,
1070610707
DAG.getConstant(EltSize, dl, IdxVT));
10707-
return DAG.getMemBasePlusOffset(VecPtr, Index, dl);
10708+
return DAG.getMemBasePlusOffset(VecPtr, Index, dl, PtrArithFlags);
1070810709
}
1070910710

1071010711
//===----------------------------------------------------------------------===//
@@ -12382,8 +12383,10 @@ SDValue TargetLowering::scalarizeExtractedVectorLoad(EVT ResultVT,
1238212383
!IsFast)
1238312384
return SDValue();
1238412385

12385-
SDValue NewPtr =
12386-
getVectorElementPointer(DAG, OriginalLoad->getBasePtr(), InVecVT, EltNo);
12386+
// The original DAG loaded the entire vector from memory, so arithmetic
12387+
// within it must be inbounds.
12388+
SDValue NewPtr = getInboundsVectorElementPointer(
12389+
DAG, OriginalLoad->getBasePtr(), InVecVT, EltNo);
1238712390

1238812391
// We are replacing a vector load with a scalar load. The new load must have
1238912392
// identical memory op ordering to the original.

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Lines changed: 76 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -1828,72 +1828,83 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
18281828
isFlatScratchBaseLegal(Addr))) {
18291829
int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
18301830

1831-
const SIInstrInfo *TII = Subtarget->getInstrInfo();
1832-
if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) {
1833-
Addr = N0;
1834-
OffsetVal = COffsetVal;
1835-
} else {
1836-
// If the offset doesn't fit, put the low bits into the offset field and
1837-
// add the rest.
1838-
//
1839-
// For a FLAT instruction the hardware decides whether to access
1840-
// global/scratch/shared memory based on the high bits of vaddr,
1841-
// ignoring the offset field, so we have to ensure that when we add
1842-
// remainder to vaddr it still points into the same underlying object.
1843-
// The easiest way to do that is to make sure that we split the offset
1844-
// into two pieces that are both >= 0 or both <= 0.
1845-
1846-
SDLoc DL(N);
1847-
uint64_t RemainderOffset;
1848-
1849-
std::tie(OffsetVal, RemainderOffset) =
1850-
TII->splitFlatOffset(COffsetVal, AS, FlatVariant);
1851-
1852-
SDValue AddOffsetLo =
1853-
getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
1854-
SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
1855-
1856-
if (Addr.getValueType().getSizeInBits() == 32) {
1857-
SmallVector<SDValue, 3> Opnds;
1858-
Opnds.push_back(N0);
1859-
Opnds.push_back(AddOffsetLo);
1860-
unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
1861-
if (Subtarget->hasAddNoCarry()) {
1862-
AddOp = AMDGPU::V_ADD_U32_e64;
1863-
Opnds.push_back(Clamp);
1864-
}
1865-
Addr = SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
1831+
// Adding the offset to the base address in a FLAT instruction must not
1832+
// change the memory aperture in which the address falls. Therefore we can
1833+
// only fold offsets from inbounds GEPs into FLAT instructions.
1834+
bool IsInBounds =
1835+
Addr.getOpcode() == ISD::PTRADD && Addr->getFlags().hasInBounds();
1836+
if (COffsetVal == 0 || FlatVariant != SIInstrFlags::FLAT || IsInBounds) {
1837+
const SIInstrInfo *TII = Subtarget->getInstrInfo();
1838+
if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) {
1839+
Addr = N0;
1840+
OffsetVal = COffsetVal;
18661841
} else {
1867-
// TODO: Should this try to use a scalar add pseudo if the base address
1868-
// is uniform and saddr is usable?
1869-
SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
1870-
SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
1871-
1872-
SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1873-
DL, MVT::i32, N0, Sub0);
1874-
SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1875-
DL, MVT::i32, N0, Sub1);
1876-
1877-
SDValue AddOffsetHi =
1878-
getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
1879-
1880-
SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
1881-
1882-
SDNode *Add =
1883-
CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
1884-
{AddOffsetLo, SDValue(N0Lo, 0), Clamp});
1885-
1886-
SDNode *Addc = CurDAG->getMachineNode(
1887-
AMDGPU::V_ADDC_U32_e64, DL, VTs,
1888-
{AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
1889-
1890-
SDValue RegSequenceArgs[] = {
1891-
CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
1892-
SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};
1893-
1894-
Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
1895-
MVT::i64, RegSequenceArgs),
1896-
0);
1842+
// If the offset doesn't fit, put the low bits into the offset field
1843+
// and add the rest.
1844+
//
1845+
// For a FLAT instruction the hardware decides whether to access
1846+
// global/scratch/shared memory based on the high bits of vaddr,
1847+
// ignoring the offset field, so we have to ensure that when we add
1848+
// remainder to vaddr it still points into the same underlying object.
1849+
// The easiest way to do that is to make sure that we split the offset
1850+
// into two pieces that are both >= 0 or both <= 0.
1851+
1852+
SDLoc DL(N);
1853+
uint64_t RemainderOffset;
1854+
1855+
std::tie(OffsetVal, RemainderOffset) =
1856+
TII->splitFlatOffset(COffsetVal, AS, FlatVariant);
1857+
1858+
SDValue AddOffsetLo =
1859+
getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
1860+
SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
1861+
1862+
if (Addr.getValueType().getSizeInBits() == 32) {
1863+
SmallVector<SDValue, 3> Opnds;
1864+
Opnds.push_back(N0);
1865+
Opnds.push_back(AddOffsetLo);
1866+
unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
1867+
if (Subtarget->hasAddNoCarry()) {
1868+
AddOp = AMDGPU::V_ADD_U32_e64;
1869+
Opnds.push_back(Clamp);
1870+
}
1871+
Addr =
1872+
SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
1873+
} else {
1874+
// TODO: Should this try to use a scalar add pseudo if the base
1875+
// address is uniform and saddr is usable?
1876+
SDValue Sub0 =
1877+
CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
1878+
SDValue Sub1 =
1879+
CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
1880+
1881+
SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1882+
DL, MVT::i32, N0, Sub0);
1883+
SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1884+
DL, MVT::i32, N0, Sub1);
1885+
1886+
SDValue AddOffsetHi =
1887+
getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
1888+
1889+
SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
1890+
1891+
SDNode *Add =
1892+
CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
1893+
{AddOffsetLo, SDValue(N0Lo, 0), Clamp});
1894+
1895+
SDNode *Addc = CurDAG->getMachineNode(
1896+
AMDGPU::V_ADDC_U32_e64, DL, VTs,
1897+
{AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
1898+
1899+
SDValue RegSequenceArgs[] = {
1900+
CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL,
1901+
MVT::i32),
1902+
SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};
1903+
1904+
Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
1905+
MVT::i64, RegSequenceArgs),
1906+
0);
1907+
}
18971908
}
18981909
}
18991910
}

0 commit comments

Comments
 (0)