@@ -1828,72 +1828,83 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
18281828 isFlatScratchBaseLegal (Addr))) {
18291829 int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue ();
18301830
1831- const SIInstrInfo *TII = Subtarget->getInstrInfo ();
1832- if (TII->isLegalFLATOffset (COffsetVal, AS, FlatVariant)) {
1833- Addr = N0;
1834- OffsetVal = COffsetVal;
1835- } else {
1836- // If the offset doesn't fit, put the low bits into the offset field and
1837- // add the rest.
1838- //
1839- // For a FLAT instruction the hardware decides whether to access
1840- // global/scratch/shared memory based on the high bits of vaddr,
1841- // ignoring the offset field, so we have to ensure that when we add
1842- // remainder to vaddr it still points into the same underlying object.
1843- // The easiest way to do that is to make sure that we split the offset
1844- // into two pieces that are both >= 0 or both <= 0.
1845-
1846- SDLoc DL (N);
1847- uint64_t RemainderOffset;
1848-
1849- std::tie (OffsetVal, RemainderOffset) =
1850- TII->splitFlatOffset (COffsetVal, AS, FlatVariant);
1851-
1852- SDValue AddOffsetLo =
1853- getMaterializedScalarImm32 (Lo_32 (RemainderOffset), DL);
1854- SDValue Clamp = CurDAG->getTargetConstant (0 , DL, MVT::i1);
1855-
1856- if (Addr.getValueType ().getSizeInBits () == 32 ) {
1857- SmallVector<SDValue, 3 > Opnds;
1858- Opnds.push_back (N0);
1859- Opnds.push_back (AddOffsetLo);
1860- unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
1861- if (Subtarget->hasAddNoCarry ()) {
1862- AddOp = AMDGPU::V_ADD_U32_e64;
1863- Opnds.push_back (Clamp);
1864- }
1865- Addr = SDValue (CurDAG->getMachineNode (AddOp, DL, MVT::i32 , Opnds), 0 );
1831+ // Adding the offset to the base address in a FLAT instruction must not
1832+ // change the memory aperture in which the address falls. Therefore we can
1833+ // only fold offsets from inbounds GEPs into FLAT instructions.
1834+ bool IsInBounds =
1835+ Addr.getOpcode () == ISD::PTRADD && Addr->getFlags ().hasInBounds ();
1836+ if (COffsetVal == 0 || FlatVariant != SIInstrFlags::FLAT || IsInBounds) {
1837+ const SIInstrInfo *TII = Subtarget->getInstrInfo ();
1838+ if (TII->isLegalFLATOffset (COffsetVal, AS, FlatVariant)) {
1839+ Addr = N0;
1840+ OffsetVal = COffsetVal;
18661841 } else {
1867- // TODO: Should this try to use a scalar add pseudo if the base address
1868- // is uniform and saddr is usable?
1869- SDValue Sub0 = CurDAG->getTargetConstant (AMDGPU::sub0, DL, MVT::i32 );
1870- SDValue Sub1 = CurDAG->getTargetConstant (AMDGPU::sub1, DL, MVT::i32 );
1871-
1872- SDNode *N0Lo = CurDAG->getMachineNode (TargetOpcode::EXTRACT_SUBREG,
1873- DL, MVT::i32 , N0, Sub0);
1874- SDNode *N0Hi = CurDAG->getMachineNode (TargetOpcode::EXTRACT_SUBREG,
1875- DL, MVT::i32 , N0, Sub1);
1876-
1877- SDValue AddOffsetHi =
1878- getMaterializedScalarImm32 (Hi_32 (RemainderOffset), DL);
1879-
1880- SDVTList VTs = CurDAG->getVTList (MVT::i32 , MVT::i1);
1881-
1882- SDNode *Add =
1883- CurDAG->getMachineNode (AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
1884- {AddOffsetLo, SDValue (N0Lo, 0 ), Clamp});
1885-
1886- SDNode *Addc = CurDAG->getMachineNode (
1887- AMDGPU::V_ADDC_U32_e64, DL, VTs,
1888- {AddOffsetHi, SDValue (N0Hi, 0 ), SDValue (Add, 1 ), Clamp});
1889-
1890- SDValue RegSequenceArgs[] = {
1891- CurDAG->getTargetConstant (AMDGPU::VReg_64RegClassID, DL, MVT::i32 ),
1892- SDValue (Add, 0 ), Sub0, SDValue (Addc, 0 ), Sub1};
1893-
1894- Addr = SDValue (CurDAG->getMachineNode (AMDGPU::REG_SEQUENCE, DL,
1895- MVT::i64 , RegSequenceArgs),
1896- 0 );
1842+ // If the offset doesn't fit, put the low bits into the offset field
1843+ // and add the rest.
1844+ //
1845+ // For a FLAT instruction the hardware decides whether to access
1846+ // global/scratch/shared memory based on the high bits of vaddr,
1847+ // ignoring the offset field, so we have to ensure that when we add
1848+ // remainder to vaddr it still points into the same underlying object.
1849+ // The easiest way to do that is to make sure that we split the offset
1850+ // into two pieces that are both >= 0 or both <= 0.
1851+
1852+ SDLoc DL (N);
1853+ uint64_t RemainderOffset;
1854+
1855+ std::tie (OffsetVal, RemainderOffset) =
1856+ TII->splitFlatOffset (COffsetVal, AS, FlatVariant);
1857+
1858+ SDValue AddOffsetLo =
1859+ getMaterializedScalarImm32 (Lo_32 (RemainderOffset), DL);
1860+ SDValue Clamp = CurDAG->getTargetConstant (0 , DL, MVT::i1);
1861+
1862+ if (Addr.getValueType ().getSizeInBits () == 32 ) {
1863+ SmallVector<SDValue, 3 > Opnds;
1864+ Opnds.push_back (N0);
1865+ Opnds.push_back (AddOffsetLo);
1866+ unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
1867+ if (Subtarget->hasAddNoCarry ()) {
1868+ AddOp = AMDGPU::V_ADD_U32_e64;
1869+ Opnds.push_back (Clamp);
1870+ }
1871+ Addr =
1872+ SDValue (CurDAG->getMachineNode (AddOp, DL, MVT::i32 , Opnds), 0 );
1873+ } else {
1874+ // TODO: Should this try to use a scalar add pseudo if the base
1875+ // address is uniform and saddr is usable?
1876+ SDValue Sub0 =
1877+ CurDAG->getTargetConstant (AMDGPU::sub0, DL, MVT::i32 );
1878+ SDValue Sub1 =
1879+ CurDAG->getTargetConstant (AMDGPU::sub1, DL, MVT::i32 );
1880+
1881+ SDNode *N0Lo = CurDAG->getMachineNode (TargetOpcode::EXTRACT_SUBREG,
1882+ DL, MVT::i32 , N0, Sub0);
1883+ SDNode *N0Hi = CurDAG->getMachineNode (TargetOpcode::EXTRACT_SUBREG,
1884+ DL, MVT::i32 , N0, Sub1);
1885+
1886+ SDValue AddOffsetHi =
1887+ getMaterializedScalarImm32 (Hi_32 (RemainderOffset), DL);
1888+
1889+ SDVTList VTs = CurDAG->getVTList (MVT::i32 , MVT::i1);
1890+
1891+ SDNode *Add =
1892+ CurDAG->getMachineNode (AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
1893+ {AddOffsetLo, SDValue (N0Lo, 0 ), Clamp});
1894+
1895+ SDNode *Addc = CurDAG->getMachineNode (
1896+ AMDGPU::V_ADDC_U32_e64, DL, VTs,
1897+ {AddOffsetHi, SDValue (N0Hi, 0 ), SDValue (Add, 1 ), Clamp});
1898+
1899+ SDValue RegSequenceArgs[] = {
1900+ CurDAG->getTargetConstant (AMDGPU::VReg_64RegClassID, DL,
1901+ MVT::i32 ),
1902+ SDValue (Add, 0 ), Sub0, SDValue (Addc, 0 ), Sub1};
1903+
1904+ Addr = SDValue (CurDAG->getMachineNode (AMDGPU::REG_SEQUENCE, DL,
1905+ MVT::i64 , RegSequenceArgs),
1906+ 0 );
1907+ }
18971908 }
18981909 }
18991910 }
0 commit comments