
Commit 4a42fa5

[AMDGPU][SDAG] Only fold flat offsets if they are inbounds PTRADDs
For flat memory instructions where the address is supplied as a base address register with an immediate offset, the memory aperture test ignores the immediate offset. Currently, SDISel does not respect that, which leads to miscompilations where valid input programs crash when the address computation relies on the immediate offset to get the base address into the proper memory aperture. Global and scratch instructions are not affected.

This patch only selects flat instructions with immediate offsets from PTRADD address computations that carry the inbounds flag: if the PTRADD does not leave the bounds of the allocated object, it cannot leave the bounds of the memory aperture and is therefore safe to handle with an immediate offset (see the IR sketch below).

Affected tests:

- CodeGen/AMDGPU/fold-gep-offset.ll: Offsets are no longer wrongly folded; new positive tests were added where we still do fold them.
- CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll: Offset folding doesn't seem integral to this test, so the test is not adjusted to keep the offsets foldable.
- CodeGen/AMDGPU/loop-prefetch-data.ll: loop-reduce transforms the inbounds addresses of the accesses so that they are based on potentially out-of-bounds addresses used for prefetching.
- I think the remaining ones suffer from the limited preservation of the inbounds flag in the PTRADD DAGCombines, due to the provenance problems pointed out in PR #165424 and the fact that `AMDGPUTargetLowering::SplitVector{Load|Store}` legalizes too-wide accesses by repeatedly splitting them in half. Legalizing a V32S32 memory access therefore leads to inbounds ptradd chains like (ptradd inbounds (ptradd inbounds (ptradd inbounds P, 64), 32), 16). The DAGCombines fold these into a single ptradd, but the involved transformations generally cannot preserve the inbounds flag (even though it would be valid in this case).

Similar previous PR that relied on `ISD::ADD inbounds` instead of `ISD::PTRADD inbounds` (closed): #132353
Analogous PR for GISel (merged): #153001

Fixes SWDEV-516125.
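As a minimal illustration of the rule above (hypothetical IR, not taken from the patch or its tests; function names are made up): only the inbounds GEP may still have its constant offset folded into the flat load's immediate offset field, while the plain GEP must keep the full address computation in registers.

; %base itself might point outside the intended aperture, with only
; %base + 32 landing inside it. The hardware checks the aperture of the
; base register alone, so the +32 must not be moved into the immediate field.
define float @no_fold(ptr %base) {
  %p = getelementptr i8, ptr %base, i64 32
  %v = load float, ptr %p
  ret float %v
}

; inbounds guarantees that %base and %base + 32 stay within one allocated
; object, so both fall into the same aperture and folding the +32 is safe.
define float @may_fold(ptr %base) {
  %p = getelementptr inbounds i8, ptr %base, i64 32
  %v = load float, ptr %p
  ret float %v
}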
1 parent 2f2d265 commit 4a42fa5

8 files changed: +6178, -5256 lines changed


llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Lines changed: 76 additions & 65 deletions
@@ -1828,72 +1828,83 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
        isFlatScratchBaseLegal(Addr))) {
     int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
 
-    const SIInstrInfo *TII = Subtarget->getInstrInfo();
-    if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) {
-      Addr = N0;
-      OffsetVal = COffsetVal;
-    } else {
-      // If the offset doesn't fit, put the low bits into the offset field and
-      // add the rest.
-      //
-      // For a FLAT instruction the hardware decides whether to access
-      // global/scratch/shared memory based on the high bits of vaddr,
-      // ignoring the offset field, so we have to ensure that when we add
-      // remainder to vaddr it still points into the same underlying object.
-      // The easiest way to do that is to make sure that we split the offset
-      // into two pieces that are both >= 0 or both <= 0.
-
-      SDLoc DL(N);
-      uint64_t RemainderOffset;
-
-      std::tie(OffsetVal, RemainderOffset) =
-          TII->splitFlatOffset(COffsetVal, AS, FlatVariant);
-
-      SDValue AddOffsetLo =
-          getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
-      SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
-
-      if (Addr.getValueType().getSizeInBits() == 32) {
-        SmallVector<SDValue, 3> Opnds;
-        Opnds.push_back(N0);
-        Opnds.push_back(AddOffsetLo);
-        unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
-        if (Subtarget->hasAddNoCarry()) {
-          AddOp = AMDGPU::V_ADD_U32_e64;
-          Opnds.push_back(Clamp);
-        }
-        Addr = SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
+    // Adding the offset to the base address in a FLAT instruction must not
+    // change the memory aperture in which the address falls. Therefore we can
+    // only fold offsets from inbounds GEPs into FLAT instructions.
+    bool IsInBounds =
+        Addr.getOpcode() == ISD::PTRADD && Addr->getFlags().hasInBounds();
+    if (COffsetVal == 0 || FlatVariant != SIInstrFlags::FLAT || IsInBounds) {
+      const SIInstrInfo *TII = Subtarget->getInstrInfo();
+      if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) {
+        Addr = N0;
+        OffsetVal = COffsetVal;
       } else {
-        // TODO: Should this try to use a scalar add pseudo if the base address
-        // is uniform and saddr is usable?
-        SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
-        SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
-
-        SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
-                                              DL, MVT::i32, N0, Sub0);
-        SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
-                                              DL, MVT::i32, N0, Sub1);
-
-        SDValue AddOffsetHi =
-            getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
-
-        SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
-
-        SDNode *Add =
-            CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
-                                   {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
-
-        SDNode *Addc = CurDAG->getMachineNode(
-            AMDGPU::V_ADDC_U32_e64, DL, VTs,
-            {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
-
-        SDValue RegSequenceArgs[] = {
-            CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
-            SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};
-
-        Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
-                                              MVT::i64, RegSequenceArgs),
-                       0);
+        // If the offset doesn't fit, put the low bits into the offset field
+        // and add the rest.
+        //
+        // For a FLAT instruction the hardware decides whether to access
+        // global/scratch/shared memory based on the high bits of vaddr,
+        // ignoring the offset field, so we have to ensure that when we add
+        // remainder to vaddr it still points into the same underlying object.
+        // The easiest way to do that is to make sure that we split the offset
+        // into two pieces that are both >= 0 or both <= 0.
+
+        SDLoc DL(N);
+        uint64_t RemainderOffset;
+
+        std::tie(OffsetVal, RemainderOffset) =
+            TII->splitFlatOffset(COffsetVal, AS, FlatVariant);
+
+        SDValue AddOffsetLo =
+            getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
+        SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
+
+        if (Addr.getValueType().getSizeInBits() == 32) {
+          SmallVector<SDValue, 3> Opnds;
+          Opnds.push_back(N0);
+          Opnds.push_back(AddOffsetLo);
+          unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
+          if (Subtarget->hasAddNoCarry()) {
+            AddOp = AMDGPU::V_ADD_U32_e64;
+            Opnds.push_back(Clamp);
+          }
+          Addr =
+              SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
+        } else {
+          // TODO: Should this try to use a scalar add pseudo if the base
+          // address is uniform and saddr is usable?
+          SDValue Sub0 =
+              CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
+          SDValue Sub1 =
+              CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
+
+          SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+                                                DL, MVT::i32, N0, Sub0);
+          SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+                                                DL, MVT::i32, N0, Sub1);
+
+          SDValue AddOffsetHi =
+              getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
+
+          SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
+
+          SDNode *Add =
+              CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
+                                     {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
+
+          SDNode *Addc = CurDAG->getMachineNode(
+              AMDGPU::V_ADDC_U32_e64, DL, VTs,
+              {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
+
+          SDValue RegSequenceArgs[] = {
+              CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL,
+                                        MVT::i32),
+              SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};
+
+          Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
+                                                MVT::i64, RegSequenceArgs),
+                         0);
+        }
       }
     }
   }

0 commit comments
