12 changes: 8 additions & 4 deletions llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1113,7 +1113,8 @@ class SelectionDAG {
SDValue Mask, SDValue EVL);

/// Returns sum of the base pointer and offset.
/// Unlike getObjectPtrOffset this does not set NoUnsignedWrap by default.
/// Unlike getObjectPtrOffset this does not set NoUnsignedWrap or InBounds by
/// default.
LLVM_ABI SDValue
getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL,
const SDNodeFlags Flags = SDNodeFlags());
@@ -1123,15 +1124,18 @@

/// Create an add instruction with appropriate flags when used for
/// addressing some offset of an object. i.e. if a load is split into multiple
/// components, create an add nuw from the base pointer to the offset.
/// components, create an add nuw (or ptradd nuw inbounds) from the base
/// pointer to the offset.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset) {
return getMemBasePlusOffset(Ptr, Offset, SL, SDNodeFlags::NoUnsignedWrap);
return getMemBasePlusOffset(
Ptr, Offset, SL, SDNodeFlags::NoUnsignedWrap | SDNodeFlags::InBounds);
}

SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, SDValue Offset) {
// The object itself can't wrap around the address space, so it shouldn't be
// possible for the adds of the offsets to the split parts to overflow.
return getMemBasePlusOffset(Ptr, Offset, SL, SDNodeFlags::NoUnsignedWrap);
return getMemBasePlusOffset(
Ptr, Offset, SL, SDNodeFlags::NoUnsignedWrap | SDNodeFlags::InBounds);
}

/// Return a new CALLSEQ_START node, that starts new call frame, in which
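A minimal caller-side sketch of the strengthened `getObjectPtrOffset`, assuming a `SelectionDAG` named `DAG`, a location `DL`, and a base pointer `BasePtr` to a 16-byte object that is accessed in full (all of these names are illustrative, not from the patch):

```cpp
// Split a 16-byte access into two 8-byte halves. Because the whole object
// is known to be accessed, the pointer to the second half can be tagged
// nuw + inbounds, which getObjectPtrOffset now does by default.
SDValue LoPtr = BasePtr;
SDValue HiPtr = DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(8));
// HiPtr is (ptradd nuw inbounds BasePtr, 8): an offset within a live object
// can neither leave its allocation nor wrap around the address space.
```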
30 changes: 24 additions & 6 deletions llvm/include/llvm/CodeGen/TargetLowering.h
@@ -5649,17 +5649,35 @@ class LLVM_ABI TargetLowering : public TargetLoweringBase {
/// Get a pointer to vector element \p Idx located in memory for a vector of
/// type \p VecVT starting at a base address of \p VecPtr. If \p Idx is out of
/// bounds the returned pointer is unspecified, but will be within the vector
/// bounds.
SDValue getVectorElementPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT,
SDValue Index) const;
/// bounds. \p PtrArithFlags can be used to mark that arithmetic within the
/// vector in memory is known not to wrap or to be inbounds.
SDValue getVectorElementPointer(
SelectionDAG &DAG, SDValue VecPtr, EVT VecVT, SDValue Index,
const SDNodeFlags PtrArithFlags = SDNodeFlags()) const;

/// Get a pointer to vector element \p Idx located in memory for a vector of
/// type \p VecVT starting at a base address of \p VecPtr. If \p Idx is out of
/// bounds the returned pointer is unspecified, but will be within the vector
/// bounds. \p VecPtr is guaranteed to point to the beginning of a memory
/// location large enough for the vector.
SDValue getInboundsVectorElementPointer(SelectionDAG &DAG, SDValue VecPtr,
EVT VecVT, SDValue Index) const {
return getVectorElementPointer(DAG, VecPtr, VecVT, Index,
SDNodeFlags::NoUnsignedWrap |
SDNodeFlags::InBounds);
}

/// Get a pointer to a sub-vector of type \p SubVecVT at index \p Idx located
/// in memory for a vector of type \p VecVT starting at a base address of
/// \p VecPtr. If \p Idx plus the size of \p SubVecVT is out of bounds the
/// returned pointer is unspecified, but the value returned will be such that
/// the entire subvector would be within the vector bounds.
SDValue getVectorSubVecPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT,
EVT SubVecVT, SDValue Index) const;
/// the entire subvector would be within the vector bounds. \p PtrArithFlags
/// can be used to mark that arithmetic within the vector in memory is known
/// not to wrap or to be inbounds.
SDValue
getVectorSubVecPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT,
EVT SubVecVT, SDValue Index,
const SDNodeFlags PtrArithFlags = SDNodeFlags()) const;

/// Method for building the DAG expansion of ISD::[US][MIN|MAX]. This
/// method accepts integers as its arguments.
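A usage sketch contrasting the two helpers; `TLI`, `DAG`, `VecPtr`, `VecVT`, and `Idx` are assumed to be in scope (illustrative only):

```cpp
// If VecPtr is known to point at a complete in-memory copy of the vector
// (e.g. the vector was just loaded, or is stored in its entirety), the
// inbounds variant tags the address arithmetic as nuw + inbounds:
SDValue EltPtr = TLI.getInboundsVectorElementPointer(DAG, VecPtr, VecVT, Idx);

// Without that guarantee, the plain variant leaves the arithmetic
// unannotated (or lets the caller pass explicit PtrArithFlags):
SDValue PlainPtr = TLI.getVectorElementPointer(DAG, VecPtr, VecVT, Idx);
```

The same pattern applies to getVectorSubVecPointer through its new PtrArithFlags parameter.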
26 changes: 23 additions & 3 deletions llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -2715,6 +2715,12 @@ SDValue DAGCombiner::visitPTRADD(SDNode *N) {
(N->getFlags() & N0->getFlags()) & SDNodeFlags::NoUnsignedWrap;
SDValue Add = DAG.getNode(ISD::ADD, DL, IntVT, {Y, Z}, Flags);
AddToWorklist(Add.getNode());
// We can't set InBounds even if both original ptradds were InBounds and
// NUW: SDAG usually represents pointers as integers; therefore, the
// matched pattern behaves as if it had implicit casts:
// (ptradd inbounds (inttoptr (ptrtoint (ptradd inbounds x, y))), z)
// The outer inbounds ptradd might therefore rely on a provenance that x
// does not have.
return DAG.getMemBasePlusOffset(X, Add, DL, Flags);
}
}
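The distinction this comment draws can be summarized as: NUW is a property of the address *value*, while InBounds is a property of pointer *provenance*. The value-level identity below holds unconditionally, which is why the NUW flags can be intersected and preserved; no such identity exists for provenance. A sketch in plain integers, not SDAG code:

```cpp
#include <cstdint>

// (x + y) + z == x + (y + z) for machine integers modulo 2^64, so the
// reassociated node always computes the same address value...
uint64_t reassoc(uint64_t x, uint64_t y, uint64_t z) { return x + (y + z); }
// ...but an inbounds ptradd additionally claims the result stays inside the
// object its *base* points to, and after reassociation the base is x, which
// may lack the provenance that the intermediate (ptradd x, y) had.
```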
@@ -2740,6 +2746,12 @@ SDValue DAGCombiner::visitPTRADD(SDNode *N) {
// that.
SDNodeFlags Flags =
(N->getFlags() & N0->getFlags()) & SDNodeFlags::NoUnsignedWrap;
// We can't set InBounds even if both original ptradds were InBounds and
// NUW: SDAG usually represents pointers as integers; therefore, the
// matched pattern behaves as if it had implicit casts:
// (ptradd inbounds (inttoptr (ptrtoint (ptradd inbounds GA, v))), c)
// The outer inbounds ptradd might therefore rely on a provenance that
// GA does not have.
SDValue Inner = DAG.getMemBasePlusOffset(GAValue, N1, DL, Flags);
AddToWorklist(Inner.getNode());
return DAG.getMemBasePlusOffset(Inner, N0.getOperand(1), DL, Flags);
@@ -2763,8 +2775,13 @@ SDValue DAGCombiner::visitPTRADD(SDNode *N) {
bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);

// If both additions in the original were NUW, reassociation preserves that.
SDNodeFlags ReassocFlags =
(N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;
SDNodeFlags CommonFlags = N->getFlags() & N1->getFlags();
SDNodeFlags ReassocFlags = CommonFlags & SDNodeFlags::NoUnsignedWrap;
if (CommonFlags.hasNoUnsignedWrap()) {
// If both operations are NUW and the PTRADD is inbounds, the offsets are
// both non-negative, so the reassociated PTRADDs are also inbounds.
ReassocFlags |= N->getFlags() & SDNodeFlags::InBounds;
}

if (ZIsConstant != YIsConstant) {
if (YIsConstant)
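A worked model of the flag rule above, with `Base` and `Size` standing for an allocation and `y`/`z` for the two offsets (an assumption-laden sketch, not compiler code; the asserts encode the premises and the conclusion):

```cpp
#include <cassert>
#include <cstdint>

void reassociatedIsInbounds(uint64_t Base, uint64_t Size, uint64_t y,
                            uint64_t z) {
  // Premises, modeling nuw on both adds and inbounds on the ptradd:
  assert(y + z >= y);             // the integer add y + z does not wrap
  assert(Base + (y + z) >= Base); // the ptradd does not wrap
  assert(y + z <= Size);          // the final address is within the object
  // Since nothing wraps, y and z act as non-negative displacements, so each
  // is bounded by y + z; the reassociated intermediate ptradds (Base + y or
  // Base + z) therefore also land within [Base, Base + Size].
  assert(y <= Size && z <= Size);
}
```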
@@ -22745,7 +22762,10 @@ SDValue DAGCombiner::replaceStoreOfInsertLoad(StoreSDNode *ST) {
NewPtr = DAG.getMemBasePlusOffset(Ptr, TypeSize::getFixed(COffset), DL);
PointerInfo = ST->getPointerInfo().getWithOffset(COffset);
} else {
NewPtr = TLI.getVectorElementPointer(DAG, Ptr, Value.getValueType(), Idx);
// The original DAG loaded the entire vector from memory, so arithmetic
// within it must be inbounds.
NewPtr = TLI.getInboundsVectorElementPointer(DAG, Ptr, Value.getValueType(),
Idx);
}

return DAG.getStore(Chain, DL, Elt, NewPtr, PointerInfo, ST->getAlign(),
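The justification in the comment above, made concrete: the rewritten store addresses an element of a vector whose full byte range [Ptr, Ptr + NumElts * EltSize) is accessed, and the element index is clamped, so the element address cannot escape that range. A plain-integer model (assumes NumElts >= 1; names are illustrative):

```cpp
#include <cassert>
#include <cstdint>

void elementStaysInVector(uint64_t Idx, uint64_t NumElts, uint64_t EltSize) {
  // Mirror of the index clamp performed inside getVectorElementPointer:
  uint64_t Clamped = Idx < NumElts ? Idx : NumElts - 1;
  // The last byte of the element is still within the vector's byte range:
  assert((Clamped + 1) * EltSize <= NumElts * EltSize);
}
```

The same reasoning backs the scalarizeExtractedVectorLoad change further down.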
25 changes: 14 additions & 11 deletions llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -10668,19 +10668,20 @@ static SDValue clampDynamicVectorIndex(SelectionDAG &DAG, SDValue Idx,
DAG.getConstant(MaxIndex, dl, IdxVT));
}

SDValue TargetLowering::getVectorElementPointer(SelectionDAG &DAG,
SDValue VecPtr, EVT VecVT,
SDValue Index) const {
SDValue
TargetLowering::getVectorElementPointer(SelectionDAG &DAG, SDValue VecPtr,
EVT VecVT, SDValue Index,
const SDNodeFlags PtrArithFlags) const {
return getVectorSubVecPointer(
DAG, VecPtr, VecVT,
EVT::getVectorVT(*DAG.getContext(), VecVT.getVectorElementType(), 1),
Index);
Index, PtrArithFlags);
}

SDValue TargetLowering::getVectorSubVecPointer(SelectionDAG &DAG,
SDValue VecPtr, EVT VecVT,
EVT SubVecVT,
SDValue Index) const {
SDValue
TargetLowering::getVectorSubVecPointer(SelectionDAG &DAG, SDValue VecPtr,
EVT VecVT, EVT SubVecVT, SDValue Index,
const SDNodeFlags PtrArithFlags) const {
SDLoc dl(Index);
// Make sure the index type is big enough to compute in.
Index = DAG.getZExtOrTrunc(Index, dl, VecPtr.getValueType());
@@ -10704,7 +10705,7 @@ SDValue TargetLowering::getVectorSubVecPointer(SelectionDAG &DAG,

Index = DAG.getNode(ISD::MUL, dl, IdxVT, Index,
DAG.getConstant(EltSize, dl, IdxVT));
return DAG.getMemBasePlusOffset(VecPtr, Index, dl);
return DAG.getMemBasePlusOffset(VecPtr, Index, dl, PtrArithFlags);
}
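A stripped-down model of the address computation in getVectorSubVecPointer: clamp the start index so the whole subvector fits, then scale by the element size (sketch only; assumes NumSubElts <= NumElts, and the names are illustrative):

```cpp
#include <algorithm>
#include <cstdint>

uint64_t subVecByteOffset(uint64_t Index, uint64_t NumElts,
                          uint64_t NumSubElts, uint64_t EltSizeBytes) {
  // clampDynamicVectorIndex: the last legal start is NumElts - NumSubElts.
  uint64_t Clamped = std::min(Index, NumElts - NumSubElts);
  // ISD::MUL by the element size; the result is then added to VecPtr by
  // getMemBasePlusOffset with the caller's PtrArithFlags.
  return Clamped * EltSizeBytes;
}
```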

//===----------------------------------------------------------------------===//
@@ -12382,8 +12383,10 @@ SDValue TargetLowering::scalarizeExtractedVectorLoad(EVT ResultVT,
!IsFast)
return SDValue();

SDValue NewPtr =
getVectorElementPointer(DAG, OriginalLoad->getBasePtr(), InVecVT, EltNo);
// The original DAG loaded the entire vector from memory, so arithmetic
// within it must be inbounds.
SDValue NewPtr = getInboundsVectorElementPointer(
DAG, OriginalLoad->getBasePtr(), InVecVT, EltNo);

// We are replacing a vector load with a scalar load. The new load must have
// identical memory op ordering to the original.
141 changes: 76 additions & 65 deletions llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1828,72 +1828,83 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
isFlatScratchBaseLegal(Addr))) {
int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();

const SIInstrInfo *TII = Subtarget->getInstrInfo();
if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) {
Addr = N0;
OffsetVal = COffsetVal;
} else {
// If the offset doesn't fit, put the low bits into the offset field and
// add the rest.
//
// For a FLAT instruction the hardware decides whether to access
// global/scratch/shared memory based on the high bits of vaddr,
// ignoring the offset field, so we have to ensure that when we add
// remainder to vaddr it still points into the same underlying object.
// The easiest way to do that is to make sure that we split the offset
// into two pieces that are both >= 0 or both <= 0.

SDLoc DL(N);
uint64_t RemainderOffset;

std::tie(OffsetVal, RemainderOffset) =
TII->splitFlatOffset(COffsetVal, AS, FlatVariant);

SDValue AddOffsetLo =
getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);

if (Addr.getValueType().getSizeInBits() == 32) {
SmallVector<SDValue, 3> Opnds;
Opnds.push_back(N0);
Opnds.push_back(AddOffsetLo);
unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
if (Subtarget->hasAddNoCarry()) {
AddOp = AMDGPU::V_ADD_U32_e64;
Opnds.push_back(Clamp);
}
Addr = SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
// Adding the offset to the base address in a FLAT instruction must not
// change the memory aperture in which the address falls. Therefore we can
// only fold offsets from inbounds GEPs into FLAT instructions.
bool IsInBounds =
Addr.getOpcode() == ISD::PTRADD && Addr->getFlags().hasInBounds();
if (COffsetVal == 0 || FlatVariant != SIInstrFlags::FLAT || IsInBounds) {
const SIInstrInfo *TII = Subtarget->getInstrInfo();
if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) {
Addr = N0;
OffsetVal = COffsetVal;
} else {
// TODO: Should this try to use a scalar add pseudo if the base address
// is uniform and saddr is usable?
SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);

SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
DL, MVT::i32, N0, Sub0);
SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
DL, MVT::i32, N0, Sub1);

SDValue AddOffsetHi =
getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);

SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);

SDNode *Add =
CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
{AddOffsetLo, SDValue(N0Lo, 0), Clamp});

SDNode *Addc = CurDAG->getMachineNode(
AMDGPU::V_ADDC_U32_e64, DL, VTs,
{AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});

SDValue RegSequenceArgs[] = {
CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};

Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
MVT::i64, RegSequenceArgs),
0);
// If the offset doesn't fit, put the low bits into the offset field
// and add the rest.
//
// For a FLAT instruction the hardware decides whether to access
// global/scratch/shared memory based on the high bits of vaddr,
// ignoring the offset field, so we have to ensure that when we add
// remainder to vaddr it still points into the same underlying object.
// The easiest way to do that is to make sure that we split the offset
// into two pieces that are both >= 0 or both <= 0.

SDLoc DL(N);
uint64_t RemainderOffset;

std::tie(OffsetVal, RemainderOffset) =
TII->splitFlatOffset(COffsetVal, AS, FlatVariant);

SDValue AddOffsetLo =
getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);

if (Addr.getValueType().getSizeInBits() == 32) {
SmallVector<SDValue, 3> Opnds;
Opnds.push_back(N0);
Opnds.push_back(AddOffsetLo);
unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
if (Subtarget->hasAddNoCarry()) {
AddOp = AMDGPU::V_ADD_U32_e64;
Opnds.push_back(Clamp);
}
Addr =
SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
} else {
// TODO: Should this try to use a scalar add pseudo if the base
// address is uniform and saddr is usable?
SDValue Sub0 =
CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
SDValue Sub1 =
CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);

SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
DL, MVT::i32, N0, Sub0);
SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
DL, MVT::i32, N0, Sub1);

SDValue AddOffsetHi =
getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);

SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);

SDNode *Add =
CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
{AddOffsetLo, SDValue(N0Lo, 0), Clamp});

SDNode *Addc = CurDAG->getMachineNode(
AMDGPU::V_ADDC_U32_e64, DL, VTs,
{AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});

SDValue RegSequenceArgs[] = {
CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL,
MVT::i32),
SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};

Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
MVT::i64, RegSequenceArgs),
0);
}
}
}
}
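When the immediate does not fit, the code above splits it so that both pieces share a sign, which keeps base + remainder inside the same underlying object, and hence the same aperture. A simplified model of that splitting, with NumBits standing for the signed width of the offset field (the real logic lives in SIInstrInfo::splitFlatOffset and also depends on the address space and FLAT variant):

```cpp
#include <cstdint>

// Split COffset into an immediate that fits a signed NumBits-wide field and
// a remainder, such that both parts are >= 0 or both are <= 0.
void splitOffsetModel(int64_t COffset, unsigned NumBits, int64_t &Imm,
                      int64_t &Remainder) {
  int64_t FieldMax = (int64_t(1) << (NumBits - 1)) - 1;
  // C++ % truncates toward zero, so Imm has the same sign as COffset ...
  Imm = COffset % (FieldMax + 1);
  // ... and so does the remainder that gets materialized into vaddr.
  Remainder = COffset - Imm;
}
```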