From 4a42fa54e95445ddc529b07f7c2dda68bf391e64 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Tue, 28 Oct 2025 12:00:42 -0400 Subject: [PATCH] [AMDGPU][SDAG] Only fold flat offsets if they are inbounds PTRADDs For flat memory instructions where the address is supplied as a base address register with an immediate offset, the memory aperture test ignores the immediate offset. Currently, SDISel does not respect that, which leads to miscompilations where valid input programs crash when the address computation relies on the immediate offset to get the base address in the proper memory aperture. Global or scratch instructions are not affected. This patch only selects flat instructions with immediate offsets from PTRADD address computations with the inbounds flag: If the PTRADD does not leave the bounds of the allocated object, it cannot leave the bounds of the memory aperture and is therefore safe to handle with an immediate offset. Affected tests: - CodeGen/AMDGPU/fold-gep-offset.ll: Offsets are no longer wrongly folded, added new positive tests where we still do fold them. - CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll: Offset folding doesn't seem integral to this test, so the test is not changed to make offset folding still happen. - CodeGen/AMDGPU/loop-prefetch-data.ll: loop-reduce transforms inbounds addresses for accesses to be based on potentially OOB addresses used for prefetching. - I think the remaining ones suffer from the limited preservation of the inbounds flag in PTRADD DAGCombines due to the provenance problems pointed out in PR #165424 and the fact that `AMDGPUTargetLowering::SplitVector{Load|Store}` legalizes too-wide accesses by repeatedly splitting them in half. Legalizing a V32S32 memory access therefore leads to inbounds ptradd chains like (ptradd inbounds (ptradd inbounds (ptradd inbounds P, 64), 32), 16). 
The DAGCombines fold them into a single ptradd, but the involved transformations generally cannot preserve the inbounds flag (even though it would be valid in this case). Similar previous PR that relied on `ISD::ADD inbounds` instead of `ISD::PTRADD inbounds` (closed): #132353 Analogous PR for GISel (merged): #153001 Fixes SWDEV-516125. --- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 141 +- llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll | 915 +- .../AMDGPU/infer-addrspace-flat-atomic.ll | 16 +- .../test/CodeGen/AMDGPU/loop-prefetch-data.ll | 49 +- .../CodeGen/AMDGPU/memintrinsic-unroll.ll | 10065 ++++++++-------- llvm/test/CodeGen/AMDGPU/neg_ashr64_reduce.ll | 10 +- .../AMDGPU/no-folding-imm-to-inst-with-fi.ll | 50 +- .../AMDGPU/preserve-wwm-copy-dst-reg.ll | 188 +- 8 files changed, 6178 insertions(+), 5256 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index b8b419d93021a..f16eb1649be42 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1828,72 +1828,83 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr, isFlatScratchBaseLegal(Addr))) { int64_t COffsetVal = cast(N1)->getSExtValue(); - const SIInstrInfo *TII = Subtarget->getInstrInfo(); - if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) { - Addr = N0; - OffsetVal = COffsetVal; - } else { - // If the offset doesn't fit, put the low bits into the offset field and - // add the rest. - // - // For a FLAT instruction the hardware decides whether to access - // global/scratch/shared memory based on the high bits of vaddr, - // ignoring the offset field, so we have to ensure that when we add - // remainder to vaddr it still points into the same underlying object. - // The easiest way to do that is to make sure that we split the offset - // into two pieces that are both >= 0 or both <= 0. 
- - SDLoc DL(N); - uint64_t RemainderOffset; - - std::tie(OffsetVal, RemainderOffset) = - TII->splitFlatOffset(COffsetVal, AS, FlatVariant); - - SDValue AddOffsetLo = - getMaterializedScalarImm32(Lo_32(RemainderOffset), DL); - SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1); - - if (Addr.getValueType().getSizeInBits() == 32) { - SmallVector Opnds; - Opnds.push_back(N0); - Opnds.push_back(AddOffsetLo); - unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32; - if (Subtarget->hasAddNoCarry()) { - AddOp = AMDGPU::V_ADD_U32_e64; - Opnds.push_back(Clamp); - } - Addr = SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0); + // Adding the offset to the base address in a FLAT instruction must not + // change the memory aperture in which the address falls. Therefore we can + // only fold offsets from inbounds GEPs into FLAT instructions. + bool IsInBounds = + Addr.getOpcode() == ISD::PTRADD && Addr->getFlags().hasInBounds(); + if (COffsetVal == 0 || FlatVariant != SIInstrFlags::FLAT || IsInBounds) { + const SIInstrInfo *TII = Subtarget->getInstrInfo(); + if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) { + Addr = N0; + OffsetVal = COffsetVal; } else { - // TODO: Should this try to use a scalar add pseudo if the base address - // is uniform and saddr is usable? 
- SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32); - SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32); - - SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, - DL, MVT::i32, N0, Sub0); - SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, - DL, MVT::i32, N0, Sub1); - - SDValue AddOffsetHi = - getMaterializedScalarImm32(Hi_32(RemainderOffset), DL); - - SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1); - - SDNode *Add = - CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs, - {AddOffsetLo, SDValue(N0Lo, 0), Clamp}); - - SDNode *Addc = CurDAG->getMachineNode( - AMDGPU::V_ADDC_U32_e64, DL, VTs, - {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp}); - - SDValue RegSequenceArgs[] = { - CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32), - SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1}; - - Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL, - MVT::i64, RegSequenceArgs), - 0); + // If the offset doesn't fit, put the low bits into the offset field + // and add the rest. + // + // For a FLAT instruction the hardware decides whether to access + // global/scratch/shared memory based on the high bits of vaddr, + // ignoring the offset field, so we have to ensure that when we add + // remainder to vaddr it still points into the same underlying object. + // The easiest way to do that is to make sure that we split the offset + // into two pieces that are both >= 0 or both <= 0. 
+ + SDLoc DL(N); + uint64_t RemainderOffset; + + std::tie(OffsetVal, RemainderOffset) = + TII->splitFlatOffset(COffsetVal, AS, FlatVariant); + + SDValue AddOffsetLo = + getMaterializedScalarImm32(Lo_32(RemainderOffset), DL); + SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1); + + if (Addr.getValueType().getSizeInBits() == 32) { + SmallVector Opnds; + Opnds.push_back(N0); + Opnds.push_back(AddOffsetLo); + unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32; + if (Subtarget->hasAddNoCarry()) { + AddOp = AMDGPU::V_ADD_U32_e64; + Opnds.push_back(Clamp); + } + Addr = + SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0); + } else { + // TODO: Should this try to use a scalar add pseudo if the base + // address is uniform and saddr is usable? + SDValue Sub0 = + CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32); + SDValue Sub1 = + CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32); + + SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, N0, Sub0); + SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, N0, Sub1); + + SDValue AddOffsetHi = + getMaterializedScalarImm32(Hi_32(RemainderOffset), DL); + + SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1); + + SDNode *Add = + CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs, + {AddOffsetLo, SDValue(N0Lo, 0), Clamp}); + + SDNode *Addc = CurDAG->getMachineNode( + AMDGPU::V_ADDC_U32_e64, DL, VTs, + {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp}); + + SDValue RegSequenceArgs[] = { + CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, + MVT::i32), + SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1}; + + Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL, + MVT::i64, RegSequenceArgs), + 0); + } } } } diff --git a/llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll b/llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll index 9c49aade6099f..614500287339b 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll +++ 
b/llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll @@ -24,96 +24,82 @@ ; gep[inbounds](p, i + 3) -> gep(gep(p, i), 3) -; FIXME the offset here should not be folded: if %p points to the beginning of +; The offset here cannot be folded: if %p points to the beginning of ; scratch or LDS and %i is -1, a folded offset crashes the program. define i32 @flat_offset_maybe_oob(ptr %p, i32 %i) { -; GFX90A-SDAG-LABEL: flat_offset_maybe_oob: -; GFX90A-SDAG: ; %bb.0: -; GFX90A-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-SDAG-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX90A-SDAG-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] -; GFX90A-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX90A-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX90A-SDAG-NEXT: flat_load_dword v0, v[0:1] offset:12 -; GFX90A-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX90A-LABEL: flat_offset_maybe_oob: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX90A-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 12, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-SDAG-LABEL: flat_offset_maybe_oob: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX10-SDAG-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] -; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo -; GFX10-SDAG-NEXT: flat_load_dword v0, v[0:1] offset:12 -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: flat_offset_maybe_oob: +; GFX10: ; %bb.0: +; 
GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX10-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, 12 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX10-NEXT: flat_load_dword v0, v[0:1] +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-SDAG-LABEL: flat_offset_maybe_oob: ; GFX942-SDAG: ; %bb.0: ; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-SDAG-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX942-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1] -; GFX942-SDAG-NEXT: flat_load_dword v0, v[0:1] offset:12 +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, 12 +; GFX942-SDAG-NEXT: flat_load_dword v0, v[0:1] ; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: flat_offset_maybe_oob: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] -; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo -; GFX11-SDAG-NEXT: flat_load_b32 v0, v[0:1] offset:12 -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-SDAG-LABEL: flat_offset_maybe_oob: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 -; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) 
| instid1(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_lshlrev_b64_e32 v[2:3], 2, v[2:3] -; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX12-SDAG-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo -; GFX12-SDAG-NEXT: flat_load_b32 v0, v[0:1] offset:12 -; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-GISEL-LABEL: flat_offset_maybe_oob: -; GFX90A-GISEL: ; %bb.0: -; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX90A-GISEL-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 12, v0 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-GISEL-NEXT: flat_load_dword v0, v[0:1] -; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: flat_offset_maybe_oob: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: flat_load_b32 v0, v[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-GISEL-LABEL: flat_offset_maybe_oob: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; 
GFX10-GISEL-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] -; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo -; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, 12 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX10-GISEL-NEXT: flat_load_dword v0, v[0:1] -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX12-LABEL: flat_offset_maybe_oob: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b64_e32 v[2:3], 2, v[2:3] +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, 12 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-NEXT: flat_load_b32 v0, v[0:1] +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-GISEL-LABEL: flat_offset_maybe_oob: ; GFX942-GISEL: ; %bb.0: @@ -126,44 +112,6 @@ define i32 @flat_offset_maybe_oob(ptr %p, i32 %i) { ; GFX942-GISEL-NEXT: flat_load_dword v0, v[0:1] ; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: flat_offset_maybe_oob: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] -; 
GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo -; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, 12 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-GISEL-NEXT: flat_load_b32 v0, v[0:1] -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-GISEL-LABEL: flat_offset_maybe_oob: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 -; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_lshlrev_b64_e32 v[2:3], 2, v[2:3] -; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX12-GISEL-NEXT: s_wait_alu 0xfffd -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo -; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, 12 -; GFX12-GISEL-NEXT: s_wait_alu 0xfffd -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-GISEL-NEXT: flat_load_b32 v0, v[0:1] -; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %idx = add nsw i32 %i, 3 %arrayidx = getelementptr inbounds i32, ptr %p, i32 %idx %l = load i32, ptr %arrayidx @@ -273,13 +221,742 @@ define i32 @private_offset_maybe_oob(ptr addrspace(5) %p, i32 %i) { %l = load i32, ptr addrspace(5) %arrayidx ret i32 %l } + +; If the GEP that adds the offset is inbounds, folding the offset is legal. 
+define i32 @flat_offset_inbounds(ptr %p, i32 %i) { +; GFX90A-LABEL: flat_offset_inbounds: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX90A-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX90A-NEXT: flat_load_dword v0, v[0:1] offset:12 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: flat_offset_inbounds: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX10-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo +; GFX10-NEXT: flat_load_dword v0, v[0:1] offset:12 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-LABEL: flat_offset_inbounds: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1] +; GFX942-NEXT: flat_load_dword v0, v[0:1] offset:12 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: flat_offset_inbounds: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo +; GFX11-NEXT: flat_load_b32 v0, v[0:1] offset:12 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_offset_inbounds: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: 
s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b64_e32 v[2:3], 2, v[2:3] +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo +; GFX12-NEXT: flat_load_b32 v0, v[0:1] offset:12 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %p.1 = getelementptr inbounds i32, ptr %p, i32 %i + %arrayidx = getelementptr inbounds i32, ptr %p.1, i32 3 + %l = load i32, ptr %arrayidx + ret i32 %l +} + +define void @flat_offset_inbounds_wide(ptr %p, ptr %pout, i32 %i) { +; GFX90A-SDAG-LABEL: flat_offset_inbounds_wide: +; GFX90A-SDAG: ; %bb.0: +; GFX90A-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX90A-SDAG-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] +; GFX90A-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 +; GFX90A-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc +; GFX90A-SDAG-NEXT: v_add_co_u32_e32 v8, vcc, 28, v0 +; GFX90A-SDAG-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v1, vcc +; GFX90A-SDAG-NEXT: flat_load_dword v10, v[8:9] +; GFX90A-SDAG-NEXT: flat_load_dwordx4 v[4:7], v[0:1] offset:12 +; GFX90A-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-SDAG-NEXT: flat_store_dword v[2:3], v10 offset:16 +; GFX90A-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[4:7] +; GFX90A-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-SDAG-LABEL: flat_offset_inbounds_wide: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX10-SDAG-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] +; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX10-SDAG-NEXT: 
v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo +; GFX10-SDAG-NEXT: v_add_co_u32 v4, vcc_lo, v0, 28 +; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v1, vcc_lo +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: flat_load_dword v8, v[4:5] +; GFX10-SDAG-NEXT: flat_load_dwordx4 v[4:7], v[0:1] offset:12 +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; GFX10-SDAG-NEXT: flat_store_dword v[2:3], v8 offset:16 +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; GFX10-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[4:7] +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-SDAG-LABEL: flat_offset_inbounds_wide: +; GFX942-SDAG: ; %bb.0: +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[4:5], 2, v[0:1] +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[8:9], v[0:1], 0, 28 +; GFX942-SDAG-NEXT: flat_load_dword v10, v[8:9] +; GFX942-SDAG-NEXT: flat_load_dwordx4 v[4:7], v[0:1] offset:12 +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-SDAG-NEXT: flat_store_dword v[2:3], v10 offset:16 +; GFX942-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[4:7] +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: flat_offset_inbounds_wide: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] +; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo +; GFX11-SDAG-NEXT: v_add_co_u32 v4, vcc_lo, v0, 28 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v1, vcc_lo +; 
GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: flat_load_b32 v8, v[4:5] +; GFX11-SDAG-NEXT: flat_load_b128 v[4:7], v[0:1] offset:12 +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; GFX11-SDAG-NEXT: flat_store_b32 v[2:3], v8 offset:16 +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; GFX11-SDAG-NEXT: flat_store_b128 v[2:3], v[4:7] +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-SDAG-LABEL: flat_offset_inbounds_wide: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_lshlrev_b64_e32 v[4:5], 2, v[4:5] +; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo +; GFX12-SDAG-NEXT: v_add_co_u32 v4, vcc_lo, v0, 28 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v1, vcc_lo +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: flat_load_b32 v8, v[4:5] +; GFX12-SDAG-NEXT: flat_load_b128 v[4:7], v[0:1] offset:12 +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x101 +; GFX12-SDAG-NEXT: flat_store_b32 v[2:3], v8 offset:16 +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x1 +; GFX12-SDAG-NEXT: flat_store_b128 v[2:3], v[4:7] +; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-GISEL-LABEL: flat_offset_inbounds_wide: +; GFX90A-GISEL: ; %bb.0: +; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX90A-GISEL-NEXT: v_lshlrev_b64 v[4:5], 2, 
v[4:5] +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc +; GFX90A-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[0:1] offset:12 +; GFX90A-GISEL-NEXT: flat_load_dword v8, v[0:1] offset:28 +; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[4:7] +; GFX90A-GISEL-NEXT: flat_store_dword v[2:3], v8 offset:16 +; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: flat_offset_inbounds_wide: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX10-GISEL-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] +; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[0:1] offset:12 +; GFX10-GISEL-NEXT: flat_load_dword v0, v[0:1] offset:28 +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; GFX10-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[4:7] +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; GFX10-GISEL-NEXT: flat_store_dword v[2:3], v0 offset:16 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-GISEL-LABEL: flat_offset_inbounds_wide: +; GFX942-GISEL: ; %bb.0: +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX942-GISEL-NEXT: v_lshl_add_u64 v[0:1], v[4:5], 2, v[0:1] +; GFX942-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[0:1] offset:12 +; GFX942-GISEL-NEXT: flat_load_dword v8, v[0:1] offset:28 +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[4:7] +; GFX942-GISEL-NEXT: flat_store_dword v[2:3], v8 offset:16 +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: 
flat_offset_inbounds_wide: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] +; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: flat_load_b128 v[4:7], v[0:1] offset:12 +; GFX11-GISEL-NEXT: flat_load_b32 v0, v[0:1] offset:28 +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; GFX11-GISEL-NEXT: flat_store_b128 v[2:3], v[4:7] +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; GFX11-GISEL-NEXT: flat_store_b32 v[2:3], v0 offset:16 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-GISEL-LABEL: flat_offset_inbounds_wide: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_lshlrev_b64_e32 v[4:5], 2, v[4:5] +; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: flat_load_b128 v[4:7], v[0:1] offset:12 +; GFX12-GISEL-NEXT: flat_load_b32 v0, v[0:1] offset:28 +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x101 +; GFX12-GISEL-NEXT: flat_store_b128 v[2:3], v[4:7] +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x1 +; GFX12-GISEL-NEXT: flat_store_b32 v[2:3], v0 offset:16 +; GFX12-GISEL-NEXT: 
s_wait_dscnt 0x0 +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] + %p.1 = getelementptr inbounds i32, ptr %p, i32 %i + %arrayidx = getelementptr inbounds i32, ptr %p.1, i32 3 + %l = load <5 x i32>, ptr %arrayidx + store <5 x i32> %l, ptr %pout + ret void +} + +define void @flat_offset_inbounds_very_wide(ptr %p, ptr %pout, i32 %i) { +; GFX90A-SDAG-MUBUF-LABEL: flat_offset_inbounds_very_wide: +; GFX90A-SDAG-MUBUF: ; %bb.0: +; GFX90A-SDAG-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-SDAG-MUBUF-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX90A-SDAG-MUBUF-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] +; GFX90A-SDAG-MUBUF-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 +; GFX90A-SDAG-MUBUF-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc +; GFX90A-SDAG-MUBUF-NEXT: v_add_co_u32_e64 v24, s[4:5], 28, v0 +; GFX90A-SDAG-MUBUF-NEXT: v_addc_co_u32_e64 v25, s[4:5], 0, v1, s[4:5] +; GFX90A-SDAG-MUBUF-NEXT: v_add_co_u32_e64 v28, s[4:5], 44, v0 +; GFX90A-SDAG-MUBUF-NEXT: v_add_co_u32_e32 v36, vcc, 0x8c, v0 +; GFX90A-SDAG-MUBUF-NEXT: v_addc_co_u32_e64 v29, s[4:5], 0, v1, s[4:5] +; GFX90A-SDAG-MUBUF-NEXT: flat_load_dwordx4 v[4:7], v[28:29] offset:16 +; GFX90A-SDAG-MUBUF-NEXT: flat_load_dwordx4 v[8:11], v[28:29] offset:32 +; GFX90A-SDAG-MUBUF-NEXT: flat_load_dwordx4 v[12:15], v[28:29] offset:48 +; GFX90A-SDAG-MUBUF-NEXT: flat_load_dwordx4 v[16:19], v[28:29] offset:64 +; GFX90A-SDAG-MUBUF-NEXT: flat_load_dwordx4 v[20:23], v[28:29] offset:80 +; GFX90A-SDAG-MUBUF-NEXT: s_nop 0 +; GFX90A-SDAG-MUBUF-NEXT: flat_load_dwordx4 v[24:27], v[24:25] +; GFX90A-SDAG-MUBUF-NEXT: s_nop 0 +; GFX90A-SDAG-MUBUF-NEXT: flat_load_dwordx4 v[28:31], v[28:29] +; GFX90A-SDAG-MUBUF-NEXT: v_addc_co_u32_e32 v37, vcc, 0, v1, vcc +; GFX90A-SDAG-MUBUF-NEXT: flat_load_dwordx4 v[32:35], v[0:1] offset:12 +; GFX90A-SDAG-MUBUF-NEXT: flat_load_dwordx4 v[48:51], v[36:37] +; GFX90A-SDAG-MUBUF-NEXT: v_add_co_u32_e32 v0, vcc, 48, v2 +; GFX90A-SDAG-MUBUF-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX90A-SDAG-MUBUF-NEXT: v_add_co_u32_e32 
v36, vcc, 0x88, v2 +; GFX90A-SDAG-MUBUF-NEXT: v_addc_co_u32_e32 v37, vcc, 0, v3, vcc +; GFX90A-SDAG-MUBUF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-SDAG-MUBUF-NEXT: flat_store_dwordx4 v[0:1], v[16:19] offset:48 +; GFX90A-SDAG-MUBUF-NEXT: flat_store_dwordx4 v[0:1], v[20:23] offset:64 +; GFX90A-SDAG-MUBUF-NEXT: flat_store_dwordx4 v[2:3], v[8:11] offset:64 +; GFX90A-SDAG-MUBUF-NEXT: flat_store_dwordx4 v[0:1], v[12:15] offset:32 +; GFX90A-SDAG-MUBUF-NEXT: flat_store_dwordx4 v[2:3], v[28:31] offset:32 +; GFX90A-SDAG-MUBUF-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; GFX90A-SDAG-MUBUF-NEXT: flat_store_dwordx4 v[2:3], v[24:27] offset:16 +; GFX90A-SDAG-MUBUF-NEXT: flat_store_dwordx4 v[2:3], v[32:35] +; GFX90A-SDAG-MUBUF-NEXT: flat_store_dword v[36:37], v50 +; GFX90A-SDAG-MUBUF-NEXT: flat_store_dwordx2 v[2:3], v[48:49] offset:128 +; GFX90A-SDAG-MUBUF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-SDAG-MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-SDAG-FLATSCR-LABEL: flat_offset_inbounds_very_wide: +; GFX90A-SDAG-FLATSCR: ; %bb.0: +; GFX90A-SDAG-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-SDAG-FLATSCR-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX90A-SDAG-FLATSCR-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] +; GFX90A-SDAG-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 +; GFX90A-SDAG-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc +; GFX90A-SDAG-FLATSCR-NEXT: v_add_co_u32_e64 v24, s[0:1], 28, v0 +; GFX90A-SDAG-FLATSCR-NEXT: v_addc_co_u32_e64 v25, s[0:1], 0, v1, s[0:1] +; GFX90A-SDAG-FLATSCR-NEXT: v_add_co_u32_e64 v28, s[0:1], 44, v0 +; GFX90A-SDAG-FLATSCR-NEXT: v_add_co_u32_e32 v36, vcc, 0x8c, v0 +; GFX90A-SDAG-FLATSCR-NEXT: v_addc_co_u32_e64 v29, s[0:1], 0, v1, s[0:1] +; GFX90A-SDAG-FLATSCR-NEXT: flat_load_dwordx4 v[4:7], v[28:29] offset:16 +; GFX90A-SDAG-FLATSCR-NEXT: flat_load_dwordx4 v[8:11], v[28:29] offset:32 +; GFX90A-SDAG-FLATSCR-NEXT: flat_load_dwordx4 v[12:15], v[28:29] offset:48 +; GFX90A-SDAG-FLATSCR-NEXT: flat_load_dwordx4 v[16:19], v[28:29] 
offset:64 +; GFX90A-SDAG-FLATSCR-NEXT: flat_load_dwordx4 v[20:23], v[28:29] offset:80 +; GFX90A-SDAG-FLATSCR-NEXT: s_nop 0 +; GFX90A-SDAG-FLATSCR-NEXT: flat_load_dwordx4 v[24:27], v[24:25] +; GFX90A-SDAG-FLATSCR-NEXT: s_nop 0 +; GFX90A-SDAG-FLATSCR-NEXT: flat_load_dwordx4 v[28:31], v[28:29] +; GFX90A-SDAG-FLATSCR-NEXT: v_addc_co_u32_e32 v37, vcc, 0, v1, vcc +; GFX90A-SDAG-FLATSCR-NEXT: flat_load_dwordx4 v[32:35], v[0:1] offset:12 +; GFX90A-SDAG-FLATSCR-NEXT: flat_load_dwordx4 v[48:51], v[36:37] +; GFX90A-SDAG-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, 48, v2 +; GFX90A-SDAG-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX90A-SDAG-FLATSCR-NEXT: v_add_co_u32_e32 v36, vcc, 0x88, v2 +; GFX90A-SDAG-FLATSCR-NEXT: v_addc_co_u32_e32 v37, vcc, 0, v3, vcc +; GFX90A-SDAG-FLATSCR-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dwordx4 v[0:1], v[16:19] offset:48 +; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dwordx4 v[0:1], v[20:23] offset:64 +; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dwordx4 v[2:3], v[8:11] offset:64 +; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dwordx4 v[0:1], v[12:15] offset:32 +; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dwordx4 v[2:3], v[28:31] offset:32 +; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dwordx4 v[2:3], v[24:27] offset:16 +; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dwordx4 v[2:3], v[32:35] +; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dword v[36:37], v50 +; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dwordx2 v[2:3], v[48:49] offset:128 +; GFX90A-SDAG-FLATSCR-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-SDAG-FLATSCR-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-SDAG-LABEL: flat_offset_inbounds_very_wide: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX10-SDAG-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] +; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, 
vcc_lo +; GFX10-SDAG-NEXT: v_add_co_u32 v36, vcc_lo, v0, 28 +; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v37, null, 0, v1, vcc_lo +; GFX10-SDAG-NEXT: s_clause 0x8 +; GFX10-SDAG-NEXT: flat_load_dwordx4 v[4:7], v[36:37] offset:80 +; GFX10-SDAG-NEXT: flat_load_dwordx4 v[8:11], v[36:37] offset:96 +; GFX10-SDAG-NEXT: flat_load_dwordx4 v[12:15], v[36:37] offset:48 +; GFX10-SDAG-NEXT: flat_load_dwordx4 v[16:19], v[36:37] offset:64 +; GFX10-SDAG-NEXT: flat_load_dwordx4 v[20:23], v[36:37] offset:16 +; GFX10-SDAG-NEXT: flat_load_dwordx4 v[24:27], v[36:37] offset:32 +; GFX10-SDAG-NEXT: flat_load_dwordx4 v[28:31], v[0:1] offset:12 +; GFX10-SDAG-NEXT: flat_load_dwordx4 v[32:35], v[36:37] +; GFX10-SDAG-NEXT: flat_load_dwordx4 v[36:39], v[36:37] offset:112 +; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v2, 48 +; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo +; GFX10-SDAG-NEXT: v_add_co_u32 v48, vcc_lo, 0x88, v2 +; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v49, null, 0, v3, vcc_lo +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) +; GFX10-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:48 +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(7) lgkmcnt(8) +; GFX10-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[8:11] offset:64 +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(6) lgkmcnt(8) +; GFX10-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[12:15] offset:64 +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(5) lgkmcnt(8) +; GFX10-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[16:19] offset:32 +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(4) lgkmcnt(8) +; GFX10-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[20:23] offset:32 +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(3) lgkmcnt(8) +; GFX10-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[24:27] +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(2) lgkmcnt(8) +; GFX10-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[28:31] +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(1) lgkmcnt(8) +; GFX10-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[32:35] offset:16 +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(8) +; GFX10-SDAG-NEXT: flat_store_dword v[48:49], v38 +; 
GFX10-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[36:37] offset:128 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-SDAG-LABEL: flat_offset_inbounds_very_wide: +; GFX942-SDAG: ; %bb.0: +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[4:5], 2, v[0:1] +; GFX942-SDAG-NEXT: s_mov_b64 s[2:3], 0x5c +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[30:31], v[0:1], 0, s[2:3] +; GFX942-SDAG-NEXT: s_mov_b64 s[2:3], 0x4c +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[16:17], v[0:1], 0, s[2:3] +; GFX942-SDAG-NEXT: s_mov_b64 s[2:3], 0x7c +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[18:19], v[0:1], 0, s[2:3] +; GFX942-SDAG-NEXT: s_mov_b64 s[2:3], 0x6c +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[24:25], v[0:1], 0, 28 +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[28:29], v[0:1], 0, 60 +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[26:27], v[0:1], 0, 44 +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[20:21], v[0:1], 0, s[2:3] +; GFX942-SDAG-NEXT: flat_load_dwordx4 v[4:7], v[20:21] +; GFX942-SDAG-NEXT: flat_load_dwordx4 v[8:11], v[16:17] +; GFX942-SDAG-NEXT: flat_load_dwordx4 v[12:15], v[18:19] +; GFX942-SDAG-NEXT: ; kill: killed $vgpr16_vgpr17 +; GFX942-SDAG-NEXT: ; kill: killed $vgpr18_vgpr19 +; GFX942-SDAG-NEXT: ; kill: killed $vgpr20_vgpr21 +; GFX942-SDAG-NEXT: s_nop 0 +; GFX942-SDAG-NEXT: flat_load_dwordx4 v[16:19], v[26:27] +; GFX942-SDAG-NEXT: flat_load_dwordx4 v[20:23], v[30:31] +; GFX942-SDAG-NEXT: ; kill: killed $vgpr30_vgpr31 +; GFX942-SDAG-NEXT: ; kill: killed $vgpr26_vgpr27 +; GFX942-SDAG-NEXT: s_nop 0 +; GFX942-SDAG-NEXT: flat_load_dwordx4 v[24:27], v[24:25] +; GFX942-SDAG-NEXT: s_nop 0 +; GFX942-SDAG-NEXT: flat_load_dwordx4 v[28:31], v[28:29] +; GFX942-SDAG-NEXT: s_nop 0 +; GFX942-SDAG-NEXT: flat_load_dwordx4 v[32:35], v[0:1] offset:12 +; GFX942-SDAG-NEXT: s_mov_b64 s[0:1], 0x8c +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX942-SDAG-NEXT: flat_load_dwordx4 
v[52:55], v[0:1] +; GFX942-SDAG-NEXT: s_mov_b64 s[0:1], 0x60 +; GFX942-SDAG-NEXT: s_mov_b64 s[2:3], 0x70 +; GFX942-SDAG-NEXT: s_mov_b64 s[4:5], 0x50 +; GFX942-SDAG-NEXT: s_mov_b64 s[6:7], 0x88 +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 0, s[0:1] +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[36:37], v[2:3], 0, s[2:3] +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[38:39], v[2:3], 0, s[4:5] +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[48:49], v[2:3], 0, 48 +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[50:51], v[2:3], 0, s[6:7] +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; GFX942-SDAG-NEXT: flat_store_dwordx4 v[36:37], v[12:15] +; GFX942-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[8:11] offset:64 +; GFX942-SDAG-NEXT: flat_store_dwordx4 v[38:39], v[20:23] +; GFX942-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[16:19] offset:32 +; GFX942-SDAG-NEXT: flat_store_dwordx4 v[48:49], v[28:31] +; GFX942-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[32:35] +; GFX942-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[24:27] offset:16 +; GFX942-SDAG-NEXT: flat_store_dword v[50:51], v54 +; GFX942-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[52:53] offset:128 +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: flat_offset_inbounds_very_wide: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] +; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo +; GFX11-SDAG-NEXT: v_add_co_u32 v36, vcc_lo, v0, 28 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v37, null, 0, v1, vcc_lo +; GFX11-SDAG-NEXT: s_clause 
0x7 +; GFX11-SDAG-NEXT: flat_load_b128 v[4:7], v[36:37] offset:80 +; GFX11-SDAG-NEXT: flat_load_b128 v[8:11], v[36:37] offset:96 +; GFX11-SDAG-NEXT: flat_load_b128 v[12:15], v[36:37] offset:64 +; GFX11-SDAG-NEXT: flat_load_b128 v[16:19], v[36:37] offset:32 +; GFX11-SDAG-NEXT: flat_load_b128 v[20:23], v[36:37] offset:16 +; GFX11-SDAG-NEXT: flat_load_b128 v[24:27], v[36:37] +; GFX11-SDAG-NEXT: flat_load_b128 v[28:31], v[0:1] offset:12 +; GFX11-SDAG-NEXT: flat_load_b128 v[32:35], v[36:37] offset:112 +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: flat_load_b128 v[35:38], v[36:37] offset:48 +; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v2, 48 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo +; GFX11-SDAG-NEXT: v_add_co_u32 v48, vcc_lo, 0x88, v2 +; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v49, null, 0, v3, vcc_lo +; GFX11-SDAG-NEXT: s_clause 0x7 +; GFX11-SDAG-NEXT: flat_store_b128 v[0:1], v[4:7] offset:48 +; GFX11-SDAG-NEXT: flat_store_b128 v[0:1], v[8:11] offset:64 +; GFX11-SDAG-NEXT: flat_store_b128 v[0:1], v[12:15] offset:32 +; GFX11-SDAG-NEXT: flat_store_b128 v[0:1], v[16:19] +; GFX11-SDAG-NEXT: flat_store_b128 v[2:3], v[20:23] offset:32 +; GFX11-SDAG-NEXT: flat_store_b128 v[2:3], v[24:27] offset:16 +; GFX11-SDAG-NEXT: flat_store_b128 v[2:3], v[28:31] +; GFX11-SDAG-NEXT: flat_store_b32 v[48:49], v34 +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(8) +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: flat_store_b128 v[2:3], v[35:38] offset:64 +; GFX11-SDAG-NEXT: flat_store_b64 v[2:3], v[32:33] offset:128 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-SDAG-LABEL: flat_offset_inbounds_very_wide: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: 
s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_lshlrev_b64_e32 v[4:5], 2, v[4:5] +; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo +; GFX12-SDAG-NEXT: v_add_co_u32 v36, vcc_lo, v0, 28 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v37, null, 0, v1, vcc_lo +; GFX12-SDAG-NEXT: s_clause 0x7 +; GFX12-SDAG-NEXT: flat_load_b128 v[4:7], v[36:37] offset:80 +; GFX12-SDAG-NEXT: flat_load_b128 v[8:11], v[36:37] offset:96 +; GFX12-SDAG-NEXT: flat_load_b128 v[12:15], v[36:37] offset:64 +; GFX12-SDAG-NEXT: flat_load_b128 v[16:19], v[36:37] offset:32 +; GFX12-SDAG-NEXT: flat_load_b128 v[20:23], v[36:37] offset:16 +; GFX12-SDAG-NEXT: flat_load_b128 v[24:27], v[36:37] +; GFX12-SDAG-NEXT: flat_load_b128 v[28:31], v[0:1] offset:12 +; GFX12-SDAG-NEXT: flat_load_b128 v[32:35], v[36:37] offset:112 +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: flat_load_b128 v[35:38], v[36:37] offset:48 +; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v2, 48 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd +; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo +; GFX12-SDAG-NEXT: v_add_co_u32 v48, vcc_lo, 0x88, v2 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd +; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v49, null, 0, v3, vcc_lo +; GFX12-SDAG-NEXT: s_clause 0x7 +; GFX12-SDAG-NEXT: flat_store_b128 v[0:1], v[4:7] offset:48 +; GFX12-SDAG-NEXT: flat_store_b128 v[0:1], v[8:11] offset:64 +; GFX12-SDAG-NEXT: flat_store_b128 v[0:1], v[12:15] offset:32 +; GFX12-SDAG-NEXT: flat_store_b128 v[0:1], v[16:19] +; GFX12-SDAG-NEXT: flat_store_b128 v[2:3], v[20:23] offset:32 +; GFX12-SDAG-NEXT: flat_store_b128 v[2:3], 
v[24:27] offset:16 +; GFX12-SDAG-NEXT: flat_store_b128 v[2:3], v[28:31] +; GFX12-SDAG-NEXT: flat_store_b32 v[48:49], v34 +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x8 +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: flat_store_b128 v[2:3], v[35:38] offset:64 +; GFX12-SDAG-NEXT: flat_store_b64 v[2:3], v[32:33] offset:128 +; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-GISEL-LABEL: flat_offset_inbounds_very_wide: +; GFX90A-GISEL: ; %bb.0: +; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX90A-GISEL-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc +; GFX90A-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[0:1] offset:12 +; GFX90A-GISEL-NEXT: flat_load_dwordx4 v[8:11], v[0:1] offset:28 +; GFX90A-GISEL-NEXT: flat_load_dwordx4 v[12:15], v[0:1] offset:44 +; GFX90A-GISEL-NEXT: flat_load_dwordx4 v[16:19], v[0:1] offset:60 +; GFX90A-GISEL-NEXT: flat_load_dwordx4 v[20:23], v[0:1] offset:76 +; GFX90A-GISEL-NEXT: flat_load_dwordx4 v[24:27], v[0:1] offset:92 +; GFX90A-GISEL-NEXT: flat_load_dwordx4 v[28:31], v[0:1] offset:108 +; GFX90A-GISEL-NEXT: flat_load_dwordx4 v[32:35], v[0:1] offset:124 +; GFX90A-GISEL-NEXT: flat_load_dwordx3 v[36:38], v[0:1] offset:140 +; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[4:7] +; GFX90A-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[8:11] offset:16 +; GFX90A-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[12:15] offset:32 +; GFX90A-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[16:19] offset:48 +; GFX90A-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[20:23] offset:64 +; GFX90A-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[24:27] offset:80 +; GFX90A-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[28:31] offset:96 +; GFX90A-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[32:35] offset:112 +; GFX90A-GISEL-NEXT: flat_store_dwordx3 
v[2:3], v[36:38] offset:128 +; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: flat_offset_inbounds_very_wide: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX10-GISEL-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] +; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo +; GFX10-GISEL-NEXT: s_clause 0x8 +; GFX10-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[0:1] offset:12 +; GFX10-GISEL-NEXT: flat_load_dwordx4 v[8:11], v[0:1] offset:28 +; GFX10-GISEL-NEXT: flat_load_dwordx4 v[12:15], v[0:1] offset:44 +; GFX10-GISEL-NEXT: flat_load_dwordx4 v[16:19], v[0:1] offset:60 +; GFX10-GISEL-NEXT: flat_load_dwordx4 v[20:23], v[0:1] offset:76 +; GFX10-GISEL-NEXT: flat_load_dwordx4 v[24:27], v[0:1] offset:92 +; GFX10-GISEL-NEXT: flat_load_dwordx4 v[28:31], v[0:1] offset:108 +; GFX10-GISEL-NEXT: flat_load_dwordx4 v[32:35], v[0:1] offset:124 +; GFX10-GISEL-NEXT: flat_load_dwordx3 v[36:38], v[0:1] offset:140 +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) +; GFX10-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[4:7] +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(7) lgkmcnt(8) +; GFX10-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[8:11] offset:16 +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(6) lgkmcnt(8) +; GFX10-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[12:15] offset:32 +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(5) lgkmcnt(8) +; GFX10-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[16:19] offset:48 +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(4) lgkmcnt(8) +; GFX10-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[20:23] offset:64 +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(3) lgkmcnt(8) +; GFX10-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[24:27] offset:80 +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(2) lgkmcnt(8) +; GFX10-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[28:31] offset:96 +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(1) lgkmcnt(8) +; 
GFX10-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[32:35] offset:112 +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(8) +; GFX10-GISEL-NEXT: flat_store_dwordx3 v[2:3], v[36:38] offset:128 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-GISEL-LABEL: flat_offset_inbounds_very_wide: +; GFX942-GISEL: ; %bb.0: +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX942-GISEL-NEXT: v_lshl_add_u64 v[0:1], v[4:5], 2, v[0:1] +; GFX942-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[0:1] offset:12 +; GFX942-GISEL-NEXT: flat_load_dwordx4 v[8:11], v[0:1] offset:28 +; GFX942-GISEL-NEXT: flat_load_dwordx4 v[12:15], v[0:1] offset:44 +; GFX942-GISEL-NEXT: flat_load_dwordx4 v[16:19], v[0:1] offset:60 +; GFX942-GISEL-NEXT: flat_load_dwordx4 v[20:23], v[0:1] offset:76 +; GFX942-GISEL-NEXT: flat_load_dwordx4 v[24:27], v[0:1] offset:92 +; GFX942-GISEL-NEXT: flat_load_dwordx4 v[28:31], v[0:1] offset:108 +; GFX942-GISEL-NEXT: flat_load_dwordx4 v[32:35], v[0:1] offset:124 +; GFX942-GISEL-NEXT: flat_load_dwordx3 v[36:38], v[0:1] offset:140 +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[4:7] +; GFX942-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[8:11] offset:16 +; GFX942-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[12:15] offset:32 +; GFX942-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[16:19] offset:48 +; GFX942-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[20:23] offset:64 +; GFX942-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[24:27] offset:80 +; GFX942-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[28:31] offset:96 +; GFX942-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[32:35] offset:112 +; GFX942-GISEL-NEXT: flat_store_dwordx3 v[2:3], v[36:38] offset:128 +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: flat_offset_inbounds_very_wide: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] +; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo +; GFX11-GISEL-NEXT: s_clause 0x8 +; GFX11-GISEL-NEXT: flat_load_b128 v[4:7], v[0:1] offset:12 +; GFX11-GISEL-NEXT: flat_load_b128 v[8:11], v[0:1] offset:28 +; GFX11-GISEL-NEXT: flat_load_b128 v[12:15], v[0:1] offset:44 +; GFX11-GISEL-NEXT: flat_load_b128 v[16:19], v[0:1] offset:60 +; GFX11-GISEL-NEXT: flat_load_b128 v[20:23], v[0:1] offset:76 +; GFX11-GISEL-NEXT: flat_load_b128 v[24:27], v[0:1] offset:92 +; GFX11-GISEL-NEXT: flat_load_b128 v[28:31], v[0:1] offset:108 +; GFX11-GISEL-NEXT: flat_load_b128 v[32:35], v[0:1] offset:124 +; GFX11-GISEL-NEXT: flat_load_b96 v[36:38], v[0:1] offset:140 +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) +; GFX11-GISEL-NEXT: flat_store_b128 v[2:3], v[4:7] +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(7) lgkmcnt(8) +; GFX11-GISEL-NEXT: flat_store_b128 v[2:3], v[8:11] offset:16 +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(6) lgkmcnt(8) +; GFX11-GISEL-NEXT: flat_store_b128 v[2:3], v[12:15] offset:32 +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(5) lgkmcnt(8) +; GFX11-GISEL-NEXT: flat_store_b128 v[2:3], v[16:19] offset:48 +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(4) lgkmcnt(8) +; GFX11-GISEL-NEXT: flat_store_b128 v[2:3], v[20:23] offset:64 +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(3) lgkmcnt(8) +; GFX11-GISEL-NEXT: flat_store_b128 v[2:3], v[24:27] offset:80 +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(2) lgkmcnt(8) +; GFX11-GISEL-NEXT: flat_store_b128 v[2:3], v[28:31] offset:96 +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(1) lgkmcnt(8) +; GFX11-GISEL-NEXT: flat_store_b128 v[2:3], v[32:35] offset:112 +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(8) +; GFX11-GISEL-NEXT: flat_store_b96 v[2:3], 
v[36:38] offset:128 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-GISEL-LABEL: flat_offset_inbounds_very_wide: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_lshlrev_b64_e32 v[4:5], 2, v[4:5] +; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo +; GFX12-GISEL-NEXT: s_clause 0x8 +; GFX12-GISEL-NEXT: flat_load_b128 v[4:7], v[0:1] offset:12 +; GFX12-GISEL-NEXT: flat_load_b128 v[8:11], v[0:1] offset:28 +; GFX12-GISEL-NEXT: flat_load_b128 v[12:15], v[0:1] offset:44 +; GFX12-GISEL-NEXT: flat_load_b128 v[16:19], v[0:1] offset:60 +; GFX12-GISEL-NEXT: flat_load_b128 v[20:23], v[0:1] offset:76 +; GFX12-GISEL-NEXT: flat_load_b128 v[24:27], v[0:1] offset:92 +; GFX12-GISEL-NEXT: flat_load_b128 v[28:31], v[0:1] offset:108 +; GFX12-GISEL-NEXT: flat_load_b128 v[32:35], v[0:1] offset:124 +; GFX12-GISEL-NEXT: flat_load_b96 v[36:38], v[0:1] offset:140 +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x808 +; GFX12-GISEL-NEXT: flat_store_b128 v[2:3], v[4:7] +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x708 +; GFX12-GISEL-NEXT: flat_store_b128 v[2:3], v[8:11] offset:16 +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x608 +; GFX12-GISEL-NEXT: flat_store_b128 v[2:3], v[12:15] offset:32 +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x508 +; GFX12-GISEL-NEXT: flat_store_b128 v[2:3], v[16:19] offset:48 +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x408 +; GFX12-GISEL-NEXT: flat_store_b128 v[2:3], v[20:23] offset:64 +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x308 +; 
GFX12-GISEL-NEXT: flat_store_b128 v[2:3], v[24:27] offset:80 +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x208 +; GFX12-GISEL-NEXT: flat_store_b128 v[2:3], v[28:31] offset:96 +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x108 +; GFX12-GISEL-NEXT: flat_store_b128 v[2:3], v[32:35] offset:112 +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x8 +; GFX12-GISEL-NEXT: flat_store_b96 v[2:3], v[36:38] offset:128 +; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] + %p.1 = getelementptr inbounds i32, ptr %p, i32 %i + %arrayidx = getelementptr inbounds i32, ptr %p.1, i32 3 + %l = load <35 x i32>, ptr %arrayidx + store <35 x i32> %l, ptr %pout + ret void +} ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX10: {{.*}} ; GFX10-GISEL-FLATSCR: {{.*}} ; GFX10-MUBUF: {{.*}} ; GFX10-SDAG-FLATSCR: {{.*}} -; GFX12: {{.*}} -; GFX90A: {{.*}} ; GFX90A-GISEL-FLATSCR: {{.*}} ; GFX90A-MUBUF: {{.*}} -; GFX90A-SDAG-FLATSCR: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll b/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll index bd11b0710fadd..36df710529599 100644 --- a/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll +++ b/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll @@ -13,9 +13,9 @@ define protected amdgpu_kernel void @InferNothing(i32 %a, ptr %b, double %c) { ; CHECK-NEXT: s_lshl_b64 s[2:3], s[6:7], 3 ; CHECK-NEXT: s_add_u32 s0, s0, s2 ; CHECK-NEXT: s_addc_u32 s1, s1, s3 -; CHECK-NEXT: v_mov_b32_e32 v1, s1 -; CHECK-NEXT: v_add_co_u32_e64 v0, vcc, -8, s0 -; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; CHECK-NEXT: s_add_u32 s0, s0, -8 +; CHECK-NEXT: s_addc_u32 s1, s1, -1 +; CHECK-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; CHECK-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_wbinvl1_vol @@ -40,9 +40,9 @@ define protected amdgpu_kernel void @InferFadd(i32 %a, ptr addrspace(1) %b, doub ; 
CHECK-NEXT: s_lshl_b64 s[2:3], s[6:7], 3 ; CHECK-NEXT: s_add_u32 s0, s0, s2 ; CHECK-NEXT: s_addc_u32 s1, s1, s3 -; CHECK-NEXT: v_mov_b32_e32 v1, s1 -; CHECK-NEXT: v_add_co_u32_e64 v0, vcc, -8, s0 -; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; CHECK-NEXT: s_add_u32 s0, s0, -8 +; CHECK-NEXT: s_addc_u32 s1, s1, -1 +; CHECK-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; CHECK-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_wbinvl1_vol @@ -71,11 +71,13 @@ define protected amdgpu_kernel void @InferMixed(i32 %a, ptr addrspace(1) %b, dou ; CHECK-NEXT: s_addc_u32 s1, s1, s3 ; CHECK-NEXT: s_add_u32 s0, s0, -8 ; CHECK-NEXT: s_addc_u32 s1, s1, -1 +; CHECK-NEXT: s_add_u32 s0, s0, 1 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_wbinvl1_vol ; CHECK-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; CHECK-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] offset:1 +; CHECK-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_wbinvl1_vol ; CHECK-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll index 4ad161c03f5b7..2ff69d234455f 100644 --- a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll +++ b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll @@ -12,18 +12,22 @@ define amdgpu_kernel void @copy_flat(ptr nocapture %d, ptr nocapture readonly %s ; GFX12-NEXT: s_cbranch_scc1 .LBB0_3 ; GFX12-NEXT: ; %bb.1: ; %for.body.preheader ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_movk_i32 s4, 0xff50 +; GFX12-NEXT: s_mov_b32 s5, -1 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0 ; GFX12-NEXT: .LBB0_2: ; %for.body ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[8:9], 
s[2:3], s[4:5] ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 ; GFX12-NEXT: s_add_co_i32 s6, s6, -1 ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16 -; GFX12-NEXT: flat_load_b128 v[0:3], v[0:1] offset:-176 ; GFX12-NEXT: s_cmp_lg_u32 s6, 0 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16 +; GFX12-NEXT: flat_load_b128 v[0:3], v[0:1] ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: flat_store_b128 v[4:5], v[0:3] ; GFX12-NEXT: s_cbranch_scc1 .LBB0_2 @@ -38,17 +42,20 @@ define amdgpu_kernel void @copy_flat(ptr nocapture %d, ptr nocapture readonly %s ; GFX12-SPREFETCH-NEXT: s_cbranch_scc1 .LBB0_3 ; GFX12-SPREFETCH-NEXT: ; %bb.1: ; %for.body.preheader ; GFX12-SPREFETCH-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SPREFETCH-NEXT: s_movk_i32 s4, 0xff50 +; GFX12-SPREFETCH-NEXT: s_mov_b32 s5, -1 ; GFX12-SPREFETCH-NEXT: s_wait_kmcnt 0x0 ; GFX12-SPREFETCH-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0 ; GFX12-SPREFETCH-NEXT: .LBB0_2: ; %for.body ; GFX12-SPREFETCH-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-SPREFETCH-NEXT: s_wait_alu 0xfffe -; GFX12-SPREFETCH-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-SPREFETCH-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-SPREFETCH-NEXT: s_add_nc_u64 s[8:9], s[2:3], s[4:5] ; GFX12-SPREFETCH-NEXT: s_prefetch_data s[2:3], 0x0, null, 0 +; GFX12-SPREFETCH-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 ; GFX12-SPREFETCH-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 ; GFX12-SPREFETCH-NEXT: s_add_co_i32 s6, s6, -1 -; GFX12-SPREFETCH-NEXT: flat_load_b128 v[0:3], v[0:1] offset:-176 ; GFX12-SPREFETCH-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16 +; GFX12-SPREFETCH-NEXT: flat_load_b128 v[0:3], v[0:1] ; GFX12-SPREFETCH-NEXT: s_cmp_lg_u32 s6, 0 ; GFX12-SPREFETCH-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16 ; GFX12-SPREFETCH-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -410,10 
+417,14 @@ define amdgpu_kernel void @copy_flat_divergent(ptr nocapture %d, ptr nocapture r ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX12-NEXT: .LBB4_2: ; %for.body ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: flat_load_b128 v[4:7], v[2:3] offset:-176 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xffffff50, v2 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v3, vcc_lo ; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v2, 16 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX12-NEXT: flat_load_b128 v[4:7], v[4:5] ; GFX12-NEXT: s_add_co_i32 s0, s0, -1 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u32 s0, 0 @@ -448,10 +459,14 @@ define amdgpu_kernel void @copy_flat_divergent(ptr nocapture %d, ptr nocapture r ; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX12-SPREFETCH-NEXT: .LBB4_2: ; %for.body ; GFX12-SPREFETCH-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-SPREFETCH-NEXT: flat_load_b128 v[4:7], v[2:3] offset:-176 +; GFX12-SPREFETCH-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SPREFETCH-NEXT: v_add_co_u32 v4, vcc_lo, 0xffffff50, v2 +; GFX12-SPREFETCH-NEXT: s_wait_alu 0xfffd +; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v3, vcc_lo ; GFX12-SPREFETCH-NEXT: v_add_co_u32 v2, vcc_lo, v2, 16 ; GFX12-SPREFETCH-NEXT: s_wait_alu 0xfffd ; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX12-SPREFETCH-NEXT: flat_load_b128 v[4:7], v[4:5] ; GFX12-SPREFETCH-NEXT: s_add_co_i32 s0, s0, -1 ; GFX12-SPREFETCH-NEXT: s_wait_alu 0xfffe ; GFX12-SPREFETCH-NEXT: s_cmp_lg_u32 s0, 0 @@ -466,15 +481,17 @@ define amdgpu_kernel void @copy_flat_divergent(ptr nocapture %d, ptr nocapture r ; ; GFX1250-LABEL: copy_flat_divergent: ; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 
0x34 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x34 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_cmp_eq_u32 s0, 0 +; GFX1250-NEXT: s_cmp_eq_u32 s2, 0 ; GFX1250-NEXT: s_cbranch_scc1 .LBB4_3 ; GFX1250-NEXT: ; %bb.1: ; %for.body.preheader ; GFX1250-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff50 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], s[10:11], v[0:1] ; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], s[8:9], v[0:1] @@ -482,13 +499,13 @@ define amdgpu_kernel void @copy_flat_divergent(ptr nocapture %d, ptr nocapture r ; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], 0xb0, v[2:3] ; GFX1250-NEXT: .LBB4_2: ; %for.body ; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1250-NEXT: flat_load_b128 v[4:7], v[2:3] offset:-176 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[2:3] ; GFX1250-NEXT: flat_prefetch_b8 v[2:3] scope:SCOPE_SE -; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], 16, v[2:3] -; GFX1250-NEXT: s_add_co_i32 s0, s0, -1 -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1250-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1250-NEXT: s_add_co_i32 s2, s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1250-NEXT: flat_load_b128 v[4:7], v[4:5] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b128 v[0:1], v[4:7] ; GFX1250-NEXT: s_wait_xcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll index 02f39e25cb447..5dc70c3de5bd8 100644 --- 
a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll +++ b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll @@ -16,62 +16,71 @@ define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0) ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: .LBB0_1: ; %load-store-loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v2, s4 -; CHECK-NEXT: v_add_co_ci_u32_e64 v97, null, s5, v3, vcc_lo +; CHECK-NEXT: v_add_co_u32 v24, vcc_lo, v2, s4 +; CHECK-NEXT: v_add_co_ci_u32_e64 v25, null, s5, v3, vcc_lo +; CHECK-NEXT: s_clause 0x4 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[24:25] offset:64 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[24:25] offset:32 +; CHECK-NEXT: flat_load_dwordx4 v[12:15], v[24:25] +; CHECK-NEXT: flat_load_dwordx4 v[16:19], v[24:25] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[20:23], v[24:25] offset:128 +; CHECK-NEXT: v_add_co_u32 v80, vcc_lo, v24, 0x60 +; CHECK-NEXT: v_add_co_ci_u32_e64 v81, null, 0, v25, vcc_lo +; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v24, 48 +; CHECK-NEXT: v_add_co_ci_u32_e64 v97, null, 0, v25, vcc_lo +; CHECK-NEXT: v_add_co_u32 v84, vcc_lo, 0x50, v24 +; CHECK-NEXT: v_add_co_ci_u32_e64 v85, null, 0, v25, vcc_lo +; CHECK-NEXT: s_clause 0xa +; CHECK-NEXT: flat_load_dwordx4 v[24:27], v[80:81] offset:128 +; CHECK-NEXT: flat_load_dwordx4 v[28:31], v[80:81] offset:144 +; CHECK-NEXT: flat_load_dwordx4 v[32:35], v[80:81] offset:96 +; CHECK-NEXT: flat_load_dwordx4 v[36:39], v[80:81] offset:112 +; CHECK-NEXT: flat_load_dwordx4 v[48:51], v[80:81] offset:64 +; CHECK-NEXT: flat_load_dwordx4 v[52:55], v[80:81] offset:80 +; CHECK-NEXT: flat_load_dwordx4 v[64:67], v[80:81] offset:48 +; CHECK-NEXT: flat_load_dwordx4 v[68:71], v[80:81] +; CHECK-NEXT: flat_load_dwordx4 v[80:83], v[80:81] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[84:87], v[84:85] +; CHECK-NEXT: flat_load_dwordx4 v[96:99], v[96:97] ; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4 ; CHECK-NEXT: s_add_u32 s4, s4, 0x100 -; CHECK-NEXT: 
s_clause 0xf -; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[96:97] offset:224 -; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[96:97] offset:240 -; CHECK-NEXT: flat_load_dwordx4 v[12:15], v[96:97] offset:192 -; CHECK-NEXT: flat_load_dwordx4 v[16:19], v[96:97] offset:208 -; CHECK-NEXT: flat_load_dwordx4 v[20:23], v[96:97] offset:160 -; CHECK-NEXT: flat_load_dwordx4 v[24:27], v[96:97] offset:176 -; CHECK-NEXT: flat_load_dwordx4 v[28:31], v[96:97] offset:128 -; CHECK-NEXT: flat_load_dwordx4 v[32:35], v[96:97] offset:144 -; CHECK-NEXT: flat_load_dwordx4 v[36:39], v[96:97] offset:96 -; CHECK-NEXT: flat_load_dwordx4 v[48:51], v[96:97] offset:112 -; CHECK-NEXT: flat_load_dwordx4 v[52:55], v[96:97] offset:64 -; CHECK-NEXT: flat_load_dwordx4 v[64:67], v[96:97] offset:80 -; CHECK-NEXT: flat_load_dwordx4 v[68:71], v[96:97] offset:32 -; CHECK-NEXT: flat_load_dwordx4 v[80:83], v[96:97] offset:48 -; CHECK-NEXT: flat_load_dwordx4 v[84:87], v[96:97] -; CHECK-NEXT: flat_load_dwordx4 v[96:99], v[96:97] offset:16 ; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo ; CHECK-NEXT: s_addc_u32 s5, s5, 0 +; CHECK-NEXT: v_add_co_u32 v102, vcc_lo, v100, 48 +; CHECK-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] +; CHECK-NEXT: v_add_co_ci_u32_e64 v103, null, 0, v101, vcc_lo ; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[4:7] offset:224 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[4:7] offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[8:11] offset:240 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[8:11] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[12:15] offset:192 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[12:15] ; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[16:19] offset:208 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[16:19] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15) 
-; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[20:23] offset:160 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[20:23] offset:128 ; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[24:27] offset:176 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[24:27] offset:176 ; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[28:31] offset:128 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[28:31] offset:192 ; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[32:35] offset:144 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[32:35] offset:144 ; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[36:39] offset:96 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[36:39] offset:160 ; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:112 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[48:51] offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[52:55] offset:128 ; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[64:67] offset:96 ; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:32 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[68:71] offset:48 ; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[80:83] offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[84:87] +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[84:87] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99] offset:16 -; 
CHECK-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[96:99] ; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s6 ; CHECK-NEXT: s_cbranch_vccnz .LBB0_1 ; CHECK-NEXT: ; %bb.2: ; %memcpy-split @@ -82,620 +91,655 @@ define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0) ; ALIGNED: ; %bb.0: ; %entry ; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; ALIGNED-NEXT: s_mov_b64 s[4:5], 0 -; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; ALIGNED-NEXT: 
buffer_store_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v72, off, s[0:3], s32 ; 4-byte Folded Spill ; ALIGNED-NEXT: .LBB0_1: ; %load-store-loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 -; ALIGNED-NEXT: v_add_co_u32 v24, vcc_lo, v2, s4 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v25, null, s5, v3, vcc_lo -; ALIGNED-NEXT: s_clause 0xf -; ALIGNED-NEXT: flat_load_dwordx4 v[16:19], v[24:25] offset:240 -; ALIGNED-NEXT: flat_load_dwordx4 v[20:23], v[24:25] offset:224 -; ALIGNED-NEXT: flat_load_dwordx4 v[4:7], v[24:25] -; ALIGNED-NEXT: flat_load_dwordx4 v[8:11], v[24:25] offset:16 -; ALIGNED-NEXT: flat_load_dwordx4 v[12:15], v[24:25] offset:32 -; ALIGNED-NEXT: flat_load_dwordx4 v[112:115], v[24:25] offset:48 -; ALIGNED-NEXT: flat_load_dwordx4 v[116:119], v[24:25] offset:64 -; ALIGNED-NEXT: flat_load_dwordx4 v[40:43], v[24:25] offset:80 -; ALIGNED-NEXT: flat_load_dwordx4 v[26:29], v[24:25] offset:96 -; ALIGNED-NEXT: flat_load_dwordx4 v[32:35], v[24:25] offset:112 -; ALIGNED-NEXT: flat_load_dwordx4 v[44:47], v[24:25] offset:128 -; ALIGNED-NEXT: flat_load_dwordx4 v[52:55], v[24:25] offset:144 -; ALIGNED-NEXT: flat_load_dwordx4 v[66:69], v[24:25] offset:160 -; ALIGNED-NEXT: flat_load_dwordx4 v[81:84], v[24:25] offset:176 -; ALIGNED-NEXT: flat_load_dwordx4 v[96:99], v[24:25] offset:192 -; ALIGNED-NEXT: flat_load_dwordx4 v[100:103], v[24:25] offset:208 -; ALIGNED-NEXT: s_waitcnt vmcnt(15) 
lgkmcnt(15) -; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 -; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 -; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 -; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 -; ALIGNED-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:72 -; ALIGNED-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:68 -; ALIGNED-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:64 -; ALIGNED-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v17, null, s5, v1, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v4, vcc_lo, v2, s4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v5, null, s5, v3, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v6, vcc_lo, v4, 48 +; ALIGNED-NEXT: flat_load_dwordx4 v[20:23], v[4:5] offset:128 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v8, vcc_lo, 0x50, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v12, vcc_lo, 0x60, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v16, vcc_lo, 0x70, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v24, vcc_lo, 0x90, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v52, vcc_lo, 0xa0, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v53, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v54, vcc_lo, 0xb0, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v55, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v64, vcc_lo, 0xc0, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v65, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v66, vcc_lo, 0xd0, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v67, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v68, vcc_lo, 0xe0, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v69, null, 0, v5, 
vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v70, vcc_lo, 0xf0, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v71, null, 0, v5, vcc_lo +; ALIGNED-NEXT: s_clause 0xe +; ALIGNED-NEXT: flat_load_dwordx4 v[26:29], v[4:5] +; ALIGNED-NEXT: flat_load_dwordx4 v[35:38], v[4:5] offset:16 +; ALIGNED-NEXT: flat_load_dwordx4 v[30:33], v[4:5] offset:32 +; ALIGNED-NEXT: flat_load_dwordx4 v[48:51], v[4:5] offset:64 +; ALIGNED-NEXT: flat_load_dwordx4 v[4:7], v[6:7] +; ALIGNED-NEXT: flat_load_dwordx4 v[8:11], v[8:9] +; ALIGNED-NEXT: flat_load_dwordx4 v[12:15], v[12:13] +; ALIGNED-NEXT: flat_load_dwordx4 v[16:19], v[16:17] +; ALIGNED-NEXT: flat_load_dwordx4 v[112:115], v[24:25] +; ALIGNED-NEXT: flat_load_dwordx4 v[116:119], v[52:53] +; ALIGNED-NEXT: flat_load_dwordx4 v[40:43], v[54:55] +; ALIGNED-NEXT: flat_load_dwordx4 v[44:47], v[64:65] +; ALIGNED-NEXT: flat_load_dwordx4 v[55:58], v[66:67] +; ALIGNED-NEXT: flat_load_dwordx4 v[59:62], v[68:69] +; ALIGNED-NEXT: flat_load_dwordx4 v[68:71], v[70:71] +; ALIGNED-NEXT: v_add_co_u32 v24, vcc_lo, v0, s4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v25, null, s5, v1, vcc_lo ; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 ; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v31 offset:254 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:252 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v30 offset:250 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:248 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v25 offset:246 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:244 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v24 offset:242 -; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:240 -; ALIGNED-NEXT: s_waitcnt lgkmcnt(22) -; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 -; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:84 -; ALIGNED-NEXT: 
buffer_store_dword v22, off, s[0:3], s32 offset:88 -; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:92 +; ALIGNED-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:160 +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:164 +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:168 +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:172 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:92 -; ALIGNED-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:88 -; ALIGNED-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:84 -; ALIGNED-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:80 +; ALIGNED-NEXT: buffer_load_dword v102, off, s[0:3], s32 offset:172 +; ALIGNED-NEXT: buffer_load_dword v103, off, s[0:3], s32 offset:168 +; ALIGNED-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:164 +; ALIGNED-NEXT: buffer_load_dword v72, off, s[0:3], s32 offset:160 +; ALIGNED-NEXT: v_add_co_u32 v22, vcc_lo, v24, 6 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v25, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v20, vcc_lo, v24, 3 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v25, vcc_lo ; ALIGNED-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v51 offset:238 -; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:236 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v102 offset:136 +; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:137 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v50 offset:234 -; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:232 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v103 offset:132 +; ALIGNED-NEXT: flat_store_byte v[20:21], v103 offset:133 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v49 offset:230 -; ALIGNED-NEXT: flat_store_byte 
v[16:17], v49 offset:228 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v63 offset:128 +; ALIGNED-NEXT: flat_store_byte v[20:21], v63 offset:129 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v36 offset:226 -; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:224 -; ALIGNED-NEXT: s_waitcnt lgkmcnt(16) -; ALIGNED-NEXT: buffer_store_dword v100, off, s[0:3], s32 offset:32 -; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:36 -; ALIGNED-NEXT: buffer_store_dword v102, off, s[0:3], s32 offset:40 -; ALIGNED-NEXT: buffer_store_dword v103, off, s[0:3], s32 offset:44 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v72 offset:124 +; ALIGNED-NEXT: flat_store_byte v[24:25], v72 offset:128 +; ALIGNED-NEXT: s_waitcnt lgkmcnt(19) +; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:224 +; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:228 +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:232 +; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:236 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:44 -; ALIGNED-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:40 -; ALIGNED-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:36 -; ALIGNED-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:32 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v31 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v31 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v30 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 8, v30 +; ALIGNED-NEXT: buffer_load_dword v101, off, s[0:3], s32 offset:236 +; ALIGNED-NEXT: buffer_load_dword v100, off, s[0:3], s32 offset:232 +; ALIGNED-NEXT: buffer_load_dword v99, off, s[0:3], s32 offset:228 +; ALIGNED-NEXT: buffer_load_dword v98, off, s[0:3], s32 offset:224 ; ALIGNED-NEXT: s_and_b32 vcc_lo, exec_lo, s6 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v71 offset:222 -; ALIGNED-NEXT: 
flat_store_byte v[16:17], v71 offset:220 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v101 offset:72 +; ALIGNED-NEXT: flat_store_byte v[20:21], v101 offset:73 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v70 offset:218 -; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:216 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v100 offset:68 +; ALIGNED-NEXT: flat_store_byte v[20:21], v100 offset:69 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v65 offset:214 -; ALIGNED-NEXT: flat_store_byte v[16:17], v65 offset:212 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v99 offset:64 +; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:65 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v64 offset:210 -; ALIGNED-NEXT: flat_store_byte v[16:17], v64 offset:208 -; ALIGNED-NEXT: buffer_store_dword v96, off, s[0:3], s32 offset:48 -; ALIGNED-NEXT: buffer_store_dword v97, off, s[0:3], s32 offset:52 -; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:56 -; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:60 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v98 offset:60 +; ALIGNED-NEXT: flat_store_byte v[24:25], v98 offset:64 +; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:320 +; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:324 +; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:328 +; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v87, off, s[0:3], s32 offset:60 -; ALIGNED-NEXT: buffer_load_dword v86, off, s[0:3], s32 offset:56 -; ALIGNED-NEXT: buffer_load_dword v85, off, s[0:3], s32 offset:52 -; ALIGNED-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:48 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v87 offset:206 -; ALIGNED-NEXT: flat_store_byte v[16:17], v87 
offset:204 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v86 offset:202 -; ALIGNED-NEXT: flat_store_byte v[16:17], v86 offset:200 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v85 offset:198 -; ALIGNED-NEXT: flat_store_byte v[16:17], v85 offset:196 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v80 offset:194 -; ALIGNED-NEXT: flat_store_byte v[16:17], v80 offset:192 -; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:128 -; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:132 -; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:136 -; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:140 +; ALIGNED-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:328 +; ALIGNED-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:332 +; ALIGNED-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:320 +; ALIGNED-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:324 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v39 offset:36 +; ALIGNED-NEXT: flat_store_byte v[20:21], v39 offset:37 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v34 offset:40 +; ALIGNED-NEXT: flat_store_byte v[20:21], v34 offset:41 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v33 offset:28 +; ALIGNED-NEXT: flat_store_byte v[24:25], v33 offset:32 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v32 offset:32 +; ALIGNED-NEXT: flat_store_byte v[20:21], v32 offset:33 +; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:272 +; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:276 +; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:280 +; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:284 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: 
buffer_load_dword v101, off, s[0:3], s32 offset:140 -; ALIGNED-NEXT: buffer_load_dword v99, off, s[0:3], s32 offset:136 -; ALIGNED-NEXT: buffer_load_dword v96, off, s[0:3], s32 offset:132 -; ALIGNED-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:128 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v101 offset:190 -; ALIGNED-NEXT: flat_store_byte v[16:17], v101 offset:188 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v99 offset:186 -; ALIGNED-NEXT: flat_store_byte v[16:17], v99 offset:184 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v96 offset:182 -; ALIGNED-NEXT: flat_store_byte v[16:17], v96 offset:180 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v81 offset:178 -; ALIGNED-NEXT: flat_store_byte v[16:17], v81 offset:176 -; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:144 -; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:148 -; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:152 -; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:156 +; ALIGNED-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:284 +; ALIGNED-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:280 +; ALIGNED-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:276 +; ALIGNED-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:272 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v67 offset:24 +; ALIGNED-NEXT: flat_store_byte v[20:21], v67 offset:25 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v66 offset:20 +; ALIGNED-NEXT: flat_store_byte v[20:21], v66 offset:21 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v53 offset:16 +; ALIGNED-NEXT: flat_store_byte v[20:21], v53 offset:17 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], 
v52 offset:12 +; ALIGNED-NEXT: flat_store_byte v[24:25], v52 offset:16 +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:288 +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:292 +; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:296 +; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:300 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v100, off, s[0:3], s32 offset:156 -; ALIGNED-NEXT: buffer_load_dword v97, off, s[0:3], s32 offset:152 -; ALIGNED-NEXT: buffer_load_dword v82, off, s[0:3], s32 offset:148 -; ALIGNED-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:144 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v100 offset:174 -; ALIGNED-NEXT: flat_store_byte v[16:17], v100 offset:172 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v97 offset:170 -; ALIGNED-NEXT: flat_store_byte v[16:17], v97 offset:168 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v82 offset:166 -; ALIGNED-NEXT: flat_store_byte v[16:17], v82 offset:164 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v66 offset:162 -; ALIGNED-NEXT: flat_store_byte v[16:17], v66 offset:160 -; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:96 -; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:100 -; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:104 -; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:108 +; ALIGNED-NEXT: buffer_load_dword v86, off, s[0:3], s32 offset:300 +; ALIGNED-NEXT: buffer_load_dword v82, off, s[0:3], s32 offset:296 +; ALIGNED-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:292 +; ALIGNED-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:288 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v86 offset:8 +; ALIGNED-NEXT: flat_store_byte v[20:21], v86 offset:9 +; 
ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v82 offset:4 +; ALIGNED-NEXT: flat_store_byte v[24:25], v82 offset:8 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v81 +; ALIGNED-NEXT: flat_store_byte v[24:25], v81 offset:4 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[24:25], v80 offset:2 +; ALIGNED-NEXT: flat_store_byte v[24:25], v80 +; ALIGNED-NEXT: s_waitcnt lgkmcnt(40) +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:112 +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:116 +; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:120 +; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:124 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v98, off, s[0:3], s32 offset:108 -; ALIGNED-NEXT: buffer_load_dword v83, off, s[0:3], s32 offset:104 -; ALIGNED-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:100 -; ALIGNED-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:96 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v98 offset:158 -; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:156 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v83 offset:154 -; ALIGNED-NEXT: flat_store_byte v[16:17], v83 offset:152 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v67 offset:150 -; ALIGNED-NEXT: flat_store_byte v[16:17], v67 offset:148 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v52 offset:146 -; ALIGNED-NEXT: flat_store_byte v[16:17], v52 offset:144 -; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:112 -; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:116 -; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:120 -; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:124 +; ALIGNED-NEXT: 
buffer_load_dword v97, off, s[0:3], s32 offset:124 +; ALIGNED-NEXT: buffer_load_dword v87, off, s[0:3], s32 offset:120 +; ALIGNED-NEXT: buffer_load_dword v83, off, s[0:3], s32 offset:116 +; ALIGNED-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:112 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v97 offset:248 +; ALIGNED-NEXT: flat_store_byte v[20:21], v97 offset:249 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v87 offset:244 +; ALIGNED-NEXT: flat_store_byte v[20:21], v87 offset:245 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v83 offset:240 +; ALIGNED-NEXT: flat_store_byte v[20:21], v83 offset:241 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v68 offset:236 +; ALIGNED-NEXT: flat_store_byte v[20:21], v68 offset:237 +; ALIGNED-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:128 +; ALIGNED-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:132 +; ALIGNED-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:136 +; ALIGNED-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v84, off, s[0:3], s32 offset:124 -; ALIGNED-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:120 -; ALIGNED-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:116 -; ALIGNED-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:112 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v84 offset:142 -; ALIGNED-NEXT: flat_store_byte v[16:17], v84 offset:140 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v68 offset:138 -; ALIGNED-NEXT: flat_store_byte v[16:17], v68 offset:136 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v53 offset:134 -; ALIGNED-NEXT: flat_store_byte v[16:17], v53 offset:132 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: 
flat_store_byte_d16_hi v[16:17], v37 offset:130 -; ALIGNED-NEXT: flat_store_byte v[16:17], v37 offset:128 -; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:192 -; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 -; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:200 -; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: buffer_load_dword v96, off, s[0:3], s32 offset:140 +; ALIGNED-NEXT: buffer_load_dword v84, off, s[0:3], s32 offset:136 +; ALIGNED-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:132 +; ALIGNED-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v96 offset:232 +; ALIGNED-NEXT: flat_store_byte v[20:21], v96 offset:233 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v84 offset:228 +; ALIGNED-NEXT: flat_store_byte v[20:21], v84 offset:229 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v69 offset:224 +; ALIGNED-NEXT: flat_store_byte v[20:21], v69 offset:225 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v54 offset:220 +; ALIGNED-NEXT: flat_store_byte v[20:21], v54 offset:221 +; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 +; ALIGNED-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:84 +; ALIGNED-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:88 +; ALIGNED-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:204 -; ALIGNED-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:200 -; ALIGNED-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:196 -; ALIGNED-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:192 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v69 offset:126 -; ALIGNED-NEXT: flat_store_byte 
v[16:17], v69 offset:124 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v54 offset:122 -; ALIGNED-NEXT: flat_store_byte v[16:17], v54 offset:120 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v38 offset:118 -; ALIGNED-NEXT: flat_store_byte v[16:17], v38 offset:116 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v32 offset:114 -; ALIGNED-NEXT: flat_store_byte v[16:17], v32 offset:112 -; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 -; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:212 -; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:216 -; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:220 +; ALIGNED-NEXT: buffer_load_dword v85, off, s[0:3], s32 offset:92 +; ALIGNED-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:88 +; ALIGNED-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:84 +; ALIGNED-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:80 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v85 offset:216 +; ALIGNED-NEXT: flat_store_byte v[20:21], v85 offset:217 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v70 offset:212 +; ALIGNED-NEXT: flat_store_byte v[20:21], v70 offset:213 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v55 offset:208 +; ALIGNED-NEXT: flat_store_byte v[20:21], v55 offset:209 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v48 offset:204 +; ALIGNED-NEXT: flat_store_byte v[20:21], v48 offset:205 +; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:96 +; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:100 +; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:104 +; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:108 ; ALIGNED-NEXT: s_clause 0x3 -; 
ALIGNED-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:220 -; ALIGNED-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:216 -; ALIGNED-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:212 -; ALIGNED-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:208 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v55 offset:110 -; ALIGNED-NEXT: flat_store_byte v[16:17], v55 offset:108 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v39 offset:106 -; ALIGNED-NEXT: flat_store_byte v[16:17], v39 offset:104 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v33 offset:102 -; ALIGNED-NEXT: flat_store_byte v[16:17], v33 offset:100 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v26 offset:98 -; ALIGNED-NEXT: flat_store_byte v[16:17], v26 offset:96 -; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:160 -; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:164 -; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:168 -; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:172 +; ALIGNED-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:108 +; ALIGNED-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:104 +; ALIGNED-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:100 +; ALIGNED-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v71 offset:200 +; ALIGNED-NEXT: flat_store_byte v[20:21], v71 offset:201 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v64 offset:196 +; ALIGNED-NEXT: flat_store_byte v[20:21], v64 offset:197 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v49 offset:192 +; ALIGNED-NEXT: flat_store_byte v[20:21], v49 offset:193 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: 
flat_store_byte_d16_hi v[22:23], v35 offset:188 +; ALIGNED-NEXT: flat_store_byte v[20:21], v35 offset:189 +; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:176 +; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:180 +; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:184 +; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:172 -; ALIGNED-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:168 -; ALIGNED-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:164 -; ALIGNED-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:160 +; ALIGNED-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:188 +; ALIGNED-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:184 +; ALIGNED-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:180 +; ALIGNED-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:176 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v48 offset:94 -; ALIGNED-NEXT: flat_store_byte v[16:17], v48 offset:92 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v65 offset:184 +; ALIGNED-NEXT: flat_store_byte v[20:21], v65 offset:185 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v34 offset:90 -; ALIGNED-NEXT: flat_store_byte v[16:17], v34 offset:88 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v50 offset:180 +; ALIGNED-NEXT: flat_store_byte v[20:21], v50 offset:181 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v27 offset:86 -; ALIGNED-NEXT: flat_store_byte v[16:17], v27 offset:84 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v36 offset:176 +; ALIGNED-NEXT: flat_store_byte v[20:21], v36 offset:177 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v21 offset:82 -; ALIGNED-NEXT: flat_store_byte v[16:17], v21 offset:80 -; ALIGNED-NEXT: buffer_store_dword v116, off, 
s[0:3], s32 offset:176 -; ALIGNED-NEXT: buffer_store_dword v117, off, s[0:3], s32 offset:180 -; ALIGNED-NEXT: buffer_store_dword v118, off, s[0:3], s32 offset:184 -; ALIGNED-NEXT: buffer_store_dword v119, off, s[0:3], s32 offset:188 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v29 offset:172 +; ALIGNED-NEXT: flat_store_byte v[20:21], v29 offset:173 +; ALIGNED-NEXT: buffer_store_dword v116, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: buffer_store_dword v117, off, s[0:3], s32 offset:196 +; ALIGNED-NEXT: buffer_store_dword v118, off, s[0:3], s32 offset:200 +; ALIGNED-NEXT: buffer_store_dword v119, off, s[0:3], s32 offset:204 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:188 -; ALIGNED-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:184 -; ALIGNED-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 -; ALIGNED-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:176 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v35 offset:78 -; ALIGNED-NEXT: flat_store_byte v[16:17], v35 offset:76 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v28 offset:74 -; ALIGNED-NEXT: flat_store_byte v[16:17], v28 offset:72 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v22 offset:70 -; ALIGNED-NEXT: flat_store_byte v[16:17], v22 offset:68 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v19 offset:66 -; ALIGNED-NEXT: flat_store_byte v[16:17], v19 offset:64 -; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:256 -; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:260 -; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:264 -; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:268 +; ALIGNED-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:200 +; ALIGNED-NEXT: 
buffer_load_dword v30, off, s[0:3], s32 offset:196 +; ALIGNED-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v51 offset:168 +; ALIGNED-NEXT: flat_store_byte v[20:21], v51 offset:169 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v37 offset:164 +; ALIGNED-NEXT: flat_store_byte v[20:21], v37 offset:165 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v30 offset:160 +; ALIGNED-NEXT: flat_store_byte v[20:21], v30 offset:161 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v27 offset:156 +; ALIGNED-NEXT: flat_store_byte v[20:21], v27 offset:157 +; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:144 +; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:148 +; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:152 +; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:156 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:268 -; ALIGNED-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:264 -; ALIGNED-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:260 -; ALIGNED-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 8, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v51 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v51 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v50 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 8, v50 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v29 offset:62 -; ALIGNED-NEXT: flat_store_byte v[16:17], v29 offset:60 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v23 offset:58 -; ALIGNED-NEXT: 
flat_store_byte v[16:17], v23 offset:56 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v20 offset:54 -; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:52 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v18 offset:50 -; ALIGNED-NEXT: flat_store_byte v[16:17], v18 offset:48 -; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:272 -; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:276 -; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:280 -; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:284 +; ALIGNED-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:156 +; ALIGNED-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:152 +; ALIGNED-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:148 +; ALIGNED-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:144 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v102 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v102 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v103 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v103 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 8, v63 +; ALIGNED-NEXT: flat_store_byte v[20:21], v112 offset:140 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v63 +; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:138 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v72 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v72 +; ALIGNED-NEXT: flat_store_byte v[20:21], v113 offset:136 +; ALIGNED-NEXT: flat_store_byte v[20:21], v103 offset:134 +; ALIGNED-NEXT: flat_store_byte v[20:21], v112 offset:132 +; ALIGNED-NEXT: flat_store_byte v[20:21], v114 offset:130 +; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:128 +; ALIGNED-NEXT: flat_store_byte v[20:21], v115 offset:126 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v38 offset:152 +; ALIGNED-NEXT: flat_store_byte v[20:21], v38 offset:153 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: 
flat_store_byte_d16_hi v[22:23], v31 offset:148 +; ALIGNED-NEXT: flat_store_byte v[20:21], v31 offset:149 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v28 offset:144 +; ALIGNED-NEXT: flat_store_byte v[20:21], v28 offset:145 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v26 offset:140 +; ALIGNED-NEXT: flat_store_byte v[20:21], v26 offset:141 +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:240 +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:244 +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:248 +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:252 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:280 -; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:284 -; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:272 -; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:276 +; ALIGNED-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:252 +; ALIGNED-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 +; ALIGNED-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:244 +; ALIGNED-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:240 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v101 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v101 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v100 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v100 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 8, v80 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v86 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 8, v86 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v80 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v15 offset:42 -; ALIGNED-NEXT: flat_store_byte v[16:17], 
v15 offset:40 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v19 offset:120 +; ALIGNED-NEXT: flat_store_byte v[20:21], v19 offset:121 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v14 offset:46 -; ALIGNED-NEXT: flat_store_byte v[16:17], v14 offset:44 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v18 offset:116 +; ALIGNED-NEXT: flat_store_byte v[20:21], v18 offset:117 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v13 offset:34 -; ALIGNED-NEXT: flat_store_byte v[16:17], v13 offset:32 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v17 offset:112 +; ALIGNED-NEXT: flat_store_byte v[20:21], v17 offset:113 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v12 offset:38 -; ALIGNED-NEXT: flat_store_byte v[16:17], v12 offset:36 -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 -; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:228 -; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:232 -; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:236 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v16 offset:108 +; ALIGNED-NEXT: flat_store_byte v[20:21], v16 offset:109 +; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:256 +; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:260 +; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:264 +; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:268 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 -; ALIGNED-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 -; ALIGNED-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 -; ALIGNED-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v11 offset:30 -; ALIGNED-NEXT: flat_store_byte v[16:17], v11 offset:28 -; ALIGNED-NEXT: 
s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v10 offset:26 -; ALIGNED-NEXT: flat_store_byte v[16:17], v10 offset:24 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v9 offset:22 -; ALIGNED-NEXT: flat_store_byte v[16:17], v9 offset:20 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v8 offset:18 -; ALIGNED-NEXT: flat_store_byte v[16:17], v8 offset:16 -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:240 -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:244 -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:252 +; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 +; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:264 +; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:260 +; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:256 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v15 offset:104 +; ALIGNED-NEXT: flat_store_byte v[20:21], v15 offset:105 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v14 offset:100 +; ALIGNED-NEXT: flat_store_byte v[20:21], v14 offset:101 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v13 offset:96 +; ALIGNED-NEXT: flat_store_byte v[20:21], v13 offset:97 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v12 offset:92 +; ALIGNED-NEXT: flat_store_byte v[20:21], v12 offset:93 +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:212 +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:216 +; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:220 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 -; 
ALIGNED-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 -; ALIGNED-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:244 -; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240 -; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:247 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v65 -; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:255 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v49 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v49 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:253 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v36 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 8, v36 -; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:251 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v71 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v71 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:249 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v70 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v70 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v65 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:245 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v64 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v64 -; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:215 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v67 +; ALIGNED-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:220 +; ALIGNED-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 +; ALIGNED-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:212 +; ALIGNED-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:76 +; ALIGNED-NEXT: flat_store_byte v[20:21], v101 offset:74 +; ALIGNED-NEXT: flat_store_byte v[20:21], v103 offset:72 +; ALIGNED-NEXT: flat_store_byte v[20:21], v100 offset:70 +; ALIGNED-NEXT: flat_store_byte v[20:21], v112 offset:68 +; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:66 +; ALIGNED-NEXT: flat_store_byte v[20:21], v113 offset:64 +; ALIGNED-NEXT: flat_store_byte v[20:21], v98 offset:62 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) 
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v11 offset:88 +; ALIGNED-NEXT: flat_store_byte v[20:21], v11 offset:89 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v10 offset:84 +; ALIGNED-NEXT: flat_store_byte v[20:21], v10 offset:85 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v9 offset:80 +; ALIGNED-NEXT: flat_store_byte v[20:21], v9 offset:81 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v8 offset:76 +; ALIGNED-NEXT: flat_store_byte v[20:21], v8 offset:77 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:304 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:308 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:312 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:316 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 +; ALIGNED-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:312 +; ALIGNED-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:308 +; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:304 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v32 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v39 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v39 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v34 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 8, v34 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v33 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v33 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 8, v32 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v67 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v67 -; ALIGNED-NEXT: flat_store_byte v[16:17], v113 offset:243 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v87 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v87 -; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:241 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v86 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 8, v86 -; ALIGNED-NEXT: flat_store_byte v[16:17], v114 
offset:239 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v85 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v85 -; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:237 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v80 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v80 -; ALIGNED-NEXT: flat_store_byte v[16:17], v115 offset:235 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v101 -; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:233 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v99 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v99 -; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:231 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v96 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v96 -; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:229 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 24, v81 +; ALIGNED-NEXT: flat_store_byte v[20:21], v101 offset:36 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v54 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v54 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v66 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 8, v66 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v53 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v53 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v52 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 8, v52 +; ALIGNED-NEXT: flat_store_byte v[24:25], v114 offset:1 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v82 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v82 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v81 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v81 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:227 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v100 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v100 -; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:225 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v97 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v97 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 8, v97 -; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:223 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v82 -; 
ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 8, v82 -; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:221 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v66 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 8, v66 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:219 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v98 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98 -; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:217 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v83 +; ALIGNED-NEXT: flat_store_byte v[20:21], v98 offset:40 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v87 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v87 +; ALIGNED-NEXT: flat_store_byte v[20:21], v39 offset:38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 24, v83 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v83 -; ALIGNED-NEXT: flat_store_byte v[16:17], v65 offset:213 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 24, v52 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 8, v52 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:211 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v84 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v84 -; ALIGNED-NEXT: flat_store_byte v[16:17], v64 offset:209 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v68 -; ALIGNED-NEXT: flat_store_byte v[16:17], v67 offset:149 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 24, v8 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 8, v8 +; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:44 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v68 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v68 -; ALIGNED-NEXT: flat_store_byte v[16:17], v113 offset:207 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v53 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v53 -; ALIGNED-NEXT: flat_store_byte v[16:17], v87 offset:205 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 24, v37 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v37 -; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:203 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v69 +; ALIGNED-NEXT: flat_store_byte v[20:21], v34 offset:42 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v96 +; 
ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v96 +; ALIGNED-NEXT: flat_store_byte v[20:21], v100 offset:32 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v84 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v84 +; ALIGNED-NEXT: flat_store_byte v[20:21], v33 offset:30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 24, v69 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v69 -; ALIGNED-NEXT: flat_store_byte v[16:17], v86 offset:201 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v54 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v54 -; ALIGNED-NEXT: flat_store_byte v[16:17], v114 offset:199 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v38 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 8, v38 -; ALIGNED-NEXT: flat_store_byte v[16:17], v85 offset:197 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 24, v32 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 8, v32 -; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:195 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v55 +; ALIGNED-NEXT: flat_store_byte v[20:21], v32 offset:34 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v85 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v85 +; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:28 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v70 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v70 +; ALIGNED-NEXT: flat_store_byte v[20:21], v67 offset:26 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 24, v55 +; ALIGNED-NEXT: flat_store_byte v[20:21], v54 offset:222 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 8, v8 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v55 -; ALIGNED-NEXT: flat_store_byte v[16:17], v80 offset:193 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v39 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v39 -; ALIGNED-NEXT: flat_store_byte v[16:17], v115 offset:191 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v33 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v33 -; ALIGNED-NEXT: flat_store_byte v[16:17], v101 offset:189 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v26 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 8, v26 -; 
ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:187 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v48 +; ALIGNED-NEXT: flat_store_byte v[20:21], v103 offset:24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v48 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 8, v48 -; ALIGNED-NEXT: flat_store_byte v[16:17], v99 offset:185 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v34 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 8, v34 -; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:183 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v27 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 8, v27 -; ALIGNED-NEXT: flat_store_byte v[16:17], v96 offset:181 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 24, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 8, v21 -; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:179 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 24, v35 +; ALIGNED-NEXT: flat_store_byte v[20:21], v66 offset:22 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v71 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v71 +; ALIGNED-NEXT: flat_store_byte v[20:21], v112 offset:20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v64 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v64 +; ALIGNED-NEXT: flat_store_byte v[20:21], v53 offset:18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v49 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v49 +; ALIGNED-NEXT: flat_store_byte v[20:21], v113 offset:16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v35 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v35 -; ALIGNED-NEXT: flat_store_byte v[16:17], v81 offset:177 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 24, v28 +; ALIGNED-NEXT: flat_store_byte v[20:21], v52 offset:14 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v65 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v65 +; ALIGNED-NEXT: flat_store_byte v[20:21], v115 offset:12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v50 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 8, v50 +; ALIGNED-NEXT: flat_store_byte v[20:21], v86 offset:10 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v36 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 8, v36 
+; ALIGNED-NEXT: flat_store_byte v[20:21], v24 offset:8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v29 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v29 +; ALIGNED-NEXT: flat_store_byte v[20:21], v25 offset:6 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v51 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v51 +; ALIGNED-NEXT: flat_store_byte v[20:21], v82 offset:4 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v37 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v37 +; ALIGNED-NEXT: flat_store_byte v[20:21], v81 offset:2 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 24, v30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 8, v30 +; ALIGNED-NEXT: flat_store_byte v[20:21], v80 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v27 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 8, v27 +; ALIGNED-NEXT: flat_store_byte v[20:21], v114 offset:252 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 8, v38 +; ALIGNED-NEXT: flat_store_byte v[20:21], v97 offset:250 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v31 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v31 +; ALIGNED-NEXT: flat_store_byte v[20:21], v98 offset:248 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v28 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 8, v28 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:175 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v22 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 8, v22 -; ALIGNED-NEXT: flat_store_byte v[16:17], v100 offset:173 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v19 +; ALIGNED-NEXT: flat_store_byte v[20:21], v87 offset:246 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 24, v26 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 8, v26 +; ALIGNED-NEXT: flat_store_byte v[20:21], v39 offset:244 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 24, v19 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 8, v19 -; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:171 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v29 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v29 -; ALIGNED-NEXT: flat_store_byte v[16:17], v97 offset:169 -; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v97, 24, v23 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 8, v23 -; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:167 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v20 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 8, v20 -; ALIGNED-NEXT: flat_store_byte v[16:17], v82 offset:165 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18 +; ALIGNED-NEXT: flat_store_byte v[20:21], v83 offset:242 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 24, v18 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 8, v18 -; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:163 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v15 +; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:240 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v17 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v17, 8, v17 +; ALIGNED-NEXT: flat_store_byte v[20:21], v68 offset:238 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v16, 8, v16 +; ALIGNED-NEXT: flat_store_byte v[20:21], v34 offset:236 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v15 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v15, 8, v15 -; ALIGNED-NEXT: flat_store_byte v[16:17], v66 offset:161 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v14 +; ALIGNED-NEXT: flat_store_byte v[20:21], v96 offset:234 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 24, v14 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v14, 8, v14 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:159 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v13 +; ALIGNED-NEXT: flat_store_byte v[20:21], v100 offset:232 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v13 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v13, 8, v13 -; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:157 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v12 +; ALIGNED-NEXT: flat_store_byte v[20:21], v84 offset:230 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v12 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v12, 8, v12 -; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:155 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v11 +; ALIGNED-NEXT: flat_store_byte v[20:21], v33 offset:228 +; 
ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 24, v11 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v11, 8, v11 -; ALIGNED-NEXT: flat_store_byte v[16:17], v83 offset:153 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 24, v10 +; ALIGNED-NEXT: flat_store_byte v[20:21], v69 offset:226 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v10 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v10, 8, v10 -; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:151 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v9 +; ALIGNED-NEXT: flat_store_byte v[20:21], v101 offset:224 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v9 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v9, 8, v9 -; ALIGNED-NEXT: flat_store_byte v[16:17], v65 offset:147 -; ALIGNED-NEXT: flat_store_byte v[16:17], v52 offset:145 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:143 -; ALIGNED-NEXT: flat_store_byte v[16:17], v84 offset:141 -; ALIGNED-NEXT: flat_store_byte v[16:17], v64 offset:139 -; ALIGNED-NEXT: flat_store_byte v[16:17], v68 offset:137 -; ALIGNED-NEXT: flat_store_byte v[16:17], v113 offset:135 -; ALIGNED-NEXT: flat_store_byte v[16:17], v53 offset:133 -; ALIGNED-NEXT: flat_store_byte v[16:17], v87 offset:131 -; ALIGNED-NEXT: flat_store_byte v[16:17], v37 offset:129 -; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:127 -; ALIGNED-NEXT: flat_store_byte v[16:17], v69 offset:125 -; ALIGNED-NEXT: flat_store_byte v[16:17], v86 offset:123 -; ALIGNED-NEXT: flat_store_byte v[16:17], v54 offset:121 -; ALIGNED-NEXT: flat_store_byte v[16:17], v114 offset:119 -; ALIGNED-NEXT: flat_store_byte v[16:17], v38 offset:117 -; ALIGNED-NEXT: flat_store_byte v[16:17], v85 offset:115 -; ALIGNED-NEXT: flat_store_byte v[16:17], v32 offset:113 -; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:111 -; ALIGNED-NEXT: flat_store_byte v[16:17], v55 offset:109 -; ALIGNED-NEXT: flat_store_byte v[16:17], v80 offset:107 -; ALIGNED-NEXT: flat_store_byte v[16:17], v39 offset:105 -; ALIGNED-NEXT: flat_store_byte v[16:17], v115 offset:103 -; ALIGNED-NEXT: flat_store_byte v[16:17], v33 
offset:101 -; ALIGNED-NEXT: flat_store_byte v[16:17], v101 offset:99 -; ALIGNED-NEXT: flat_store_byte v[16:17], v26 offset:97 -; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:95 -; ALIGNED-NEXT: flat_store_byte v[16:17], v48 offset:93 -; ALIGNED-NEXT: flat_store_byte v[16:17], v99 offset:91 -; ALIGNED-NEXT: flat_store_byte v[16:17], v34 offset:89 -; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:87 -; ALIGNED-NEXT: flat_store_byte v[16:17], v27 offset:85 -; ALIGNED-NEXT: flat_store_byte v[16:17], v96 offset:83 -; ALIGNED-NEXT: flat_store_byte v[16:17], v21 offset:81 -; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:79 -; ALIGNED-NEXT: flat_store_byte v[16:17], v35 offset:77 -; ALIGNED-NEXT: flat_store_byte v[16:17], v81 offset:75 -; ALIGNED-NEXT: flat_store_byte v[16:17], v28 offset:73 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:71 -; ALIGNED-NEXT: flat_store_byte v[16:17], v22 offset:69 -; ALIGNED-NEXT: flat_store_byte v[16:17], v100 offset:67 -; ALIGNED-NEXT: flat_store_byte v[16:17], v19 offset:65 -; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:63 -; ALIGNED-NEXT: flat_store_byte v[16:17], v29 offset:61 -; ALIGNED-NEXT: flat_store_byte v[16:17], v97 offset:59 -; ALIGNED-NEXT: flat_store_byte v[16:17], v23 offset:57 -; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:55 -; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:53 -; ALIGNED-NEXT: flat_store_byte v[16:17], v82 offset:51 -; ALIGNED-NEXT: flat_store_byte v[16:17], v18 offset:49 -; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:43 -; ALIGNED-NEXT: flat_store_byte v[16:17], v15 offset:41 -; ALIGNED-NEXT: flat_store_byte v[16:17], v66 offset:47 -; ALIGNED-NEXT: flat_store_byte v[16:17], v14 offset:45 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:35 -; ALIGNED-NEXT: flat_store_byte v[16:17], v13 offset:33 -; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:39 -; ALIGNED-NEXT: flat_store_byte v[16:17], v12 offset:37 -; ALIGNED-NEXT: flat_store_byte v[16:17], v70 
offset:31 -; ALIGNED-NEXT: flat_store_byte v[16:17], v11 offset:29 -; ALIGNED-NEXT: flat_store_byte v[16:17], v83 offset:27 -; ALIGNED-NEXT: flat_store_byte v[16:17], v10 offset:25 -; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:23 -; ALIGNED-NEXT: flat_store_byte v[16:17], v9 offset:21 -; ALIGNED-NEXT: flat_store_byte v[16:17], v67 offset:19 -; ALIGNED-NEXT: flat_store_byte v[16:17], v8 offset:17 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v7 offset:14 -; ALIGNED-NEXT: flat_store_byte v[16:17], v7 offset:12 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v6 offset:10 -; ALIGNED-NEXT: flat_store_byte v[16:17], v6 offset:8 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v5 offset:6 -; ALIGNED-NEXT: flat_store_byte v[16:17], v5 offset:4 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v4 offset:2 -; ALIGNED-NEXT: flat_store_byte v[16:17], v4 +; ALIGNED-NEXT: flat_store_byte v[20:21], v32 offset:220 +; ALIGNED-NEXT: flat_store_byte v[20:21], v85 offset:218 +; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:216 +; ALIGNED-NEXT: flat_store_byte v[20:21], v70 offset:214 +; ALIGNED-NEXT: flat_store_byte v[20:21], v67 offset:212 +; ALIGNED-NEXT: flat_store_byte v[20:21], v55 offset:210 +; ALIGNED-NEXT: flat_store_byte v[20:21], v103 offset:208 +; ALIGNED-NEXT: flat_store_byte v[20:21], v48 offset:206 +; ALIGNED-NEXT: flat_store_byte v[20:21], v66 offset:204 +; ALIGNED-NEXT: flat_store_byte v[20:21], v71 offset:202 +; ALIGNED-NEXT: flat_store_byte v[20:21], v112 offset:200 +; ALIGNED-NEXT: flat_store_byte v[20:21], v64 offset:198 +; ALIGNED-NEXT: flat_store_byte v[20:21], v53 offset:196 +; ALIGNED-NEXT: flat_store_byte v[20:21], v49 offset:194 +; ALIGNED-NEXT: flat_store_byte v[20:21], v113 offset:192 +; ALIGNED-NEXT: flat_store_byte v[20:21], v35 offset:190 +; ALIGNED-NEXT: flat_store_byte v[20:21], v52 
offset:188 +; ALIGNED-NEXT: flat_store_byte v[20:21], v65 offset:186 +; ALIGNED-NEXT: flat_store_byte v[20:21], v115 offset:184 +; ALIGNED-NEXT: flat_store_byte v[20:21], v50 offset:182 +; ALIGNED-NEXT: flat_store_byte v[20:21], v86 offset:180 +; ALIGNED-NEXT: flat_store_byte v[20:21], v36 offset:178 +; ALIGNED-NEXT: flat_store_byte v[20:21], v24 offset:176 +; ALIGNED-NEXT: flat_store_byte v[20:21], v29 offset:174 +; ALIGNED-NEXT: flat_store_byte v[20:21], v25 offset:172 +; ALIGNED-NEXT: flat_store_byte v[20:21], v51 offset:170 +; ALIGNED-NEXT: flat_store_byte v[20:21], v82 offset:168 +; ALIGNED-NEXT: flat_store_byte v[20:21], v37 offset:166 +; ALIGNED-NEXT: flat_store_byte v[20:21], v81 offset:164 +; ALIGNED-NEXT: flat_store_byte v[20:21], v30 offset:162 +; ALIGNED-NEXT: flat_store_byte v[20:21], v80 offset:160 +; ALIGNED-NEXT: flat_store_byte v[20:21], v27 offset:158 +; ALIGNED-NEXT: flat_store_byte v[20:21], v114 offset:156 +; ALIGNED-NEXT: flat_store_byte v[20:21], v38 offset:154 +; ALIGNED-NEXT: flat_store_byte v[20:21], v97 offset:152 +; ALIGNED-NEXT: flat_store_byte v[20:21], v31 offset:150 +; ALIGNED-NEXT: flat_store_byte v[20:21], v98 offset:148 +; ALIGNED-NEXT: flat_store_byte v[20:21], v28 offset:146 +; ALIGNED-NEXT: flat_store_byte v[20:21], v87 offset:144 +; ALIGNED-NEXT: flat_store_byte v[20:21], v26 offset:142 +; ALIGNED-NEXT: flat_store_byte v[20:21], v39 offset:124 +; ALIGNED-NEXT: flat_store_byte v[20:21], v19 offset:122 +; ALIGNED-NEXT: flat_store_byte v[20:21], v83 offset:120 +; ALIGNED-NEXT: flat_store_byte v[20:21], v18 offset:118 +; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:116 +; ALIGNED-NEXT: flat_store_byte v[20:21], v17 offset:114 +; ALIGNED-NEXT: flat_store_byte v[20:21], v68 offset:112 +; ALIGNED-NEXT: flat_store_byte v[20:21], v16 offset:110 +; ALIGNED-NEXT: flat_store_byte v[20:21], v34 offset:108 +; ALIGNED-NEXT: flat_store_byte v[20:21], v15 offset:106 +; ALIGNED-NEXT: flat_store_byte v[20:21], v96 offset:104 +; 
ALIGNED-NEXT: flat_store_byte v[20:21], v14 offset:102 +; ALIGNED-NEXT: flat_store_byte v[20:21], v100 offset:100 +; ALIGNED-NEXT: flat_store_byte v[20:21], v13 offset:98 +; ALIGNED-NEXT: flat_store_byte v[20:21], v84 offset:96 +; ALIGNED-NEXT: flat_store_byte v[20:21], v12 offset:94 +; ALIGNED-NEXT: flat_store_byte v[20:21], v33 offset:92 +; ALIGNED-NEXT: flat_store_byte v[20:21], v11 offset:90 +; ALIGNED-NEXT: flat_store_byte v[20:21], v69 offset:88 +; ALIGNED-NEXT: flat_store_byte v[20:21], v10 offset:86 +; ALIGNED-NEXT: flat_store_byte v[20:21], v101 offset:84 +; ALIGNED-NEXT: flat_store_byte v[20:21], v9 offset:82 +; ALIGNED-NEXT: flat_store_byte v[20:21], v54 offset:80 +; ALIGNED-NEXT: flat_store_byte v[20:21], v8 offset:78 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v7 offset:56 +; ALIGNED-NEXT: flat_store_byte v[20:21], v7 offset:57 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v6 offset:52 +; ALIGNED-NEXT: flat_store_byte v[20:21], v6 offset:53 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v5 offset:48 +; ALIGNED-NEXT: flat_store_byte v[20:21], v5 offset:49 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v4 offset:44 +; ALIGNED-NEXT: flat_store_byte v[20:21], v4 offset:45 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 24, v7 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v7, 8, v7 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v9, 24, v6 @@ -704,25 +748,34 @@ define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0) ; ALIGNED-NEXT: v_lshrrev_b32_e32 v5, 8, v5 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v11, 24, v4 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v4, 8, v4 -; ALIGNED-NEXT: flat_store_byte v[16:17], v8 offset:15 -; ALIGNED-NEXT: flat_store_byte v[16:17], v7 offset:13 -; ALIGNED-NEXT: flat_store_byte v[16:17], v9 offset:11 -; ALIGNED-NEXT: flat_store_byte v[16:17], v6 offset:9 -; ALIGNED-NEXT: flat_store_byte v[16:17], 
v10 offset:7 -; ALIGNED-NEXT: flat_store_byte v[16:17], v5 offset:5 -; ALIGNED-NEXT: flat_store_byte v[16:17], v11 offset:3 -; ALIGNED-NEXT: flat_store_byte v[16:17], v4 offset:1 +; ALIGNED-NEXT: flat_store_byte v[20:21], v8 offset:60 +; ALIGNED-NEXT: flat_store_byte v[20:21], v7 offset:58 +; ALIGNED-NEXT: flat_store_byte v[20:21], v9 offset:56 +; ALIGNED-NEXT: flat_store_byte v[20:21], v6 offset:54 +; ALIGNED-NEXT: flat_store_byte v[20:21], v10 offset:52 +; ALIGNED-NEXT: flat_store_byte v[20:21], v5 offset:50 +; ALIGNED-NEXT: flat_store_byte v[20:21], v11 offset:48 +; ALIGNED-NEXT: flat_store_byte v[20:21], v4 offset:46 ; ALIGNED-NEXT: s_cbranch_vccnz .LBB0_1 ; ALIGNED-NEXT: ; %bb.2: ; %memcpy-split -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32 -; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 -; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 -; ALIGNED-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 -; ALIGNED-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 -; ALIGNED-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 -; ALIGNED-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 -; ALIGNED-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 +; ALIGNED-NEXT: s_clause 0x10 +; ALIGNED-NEXT: buffer_load_dword v72, off, s[0:3], s32 +; ALIGNED-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 +; ALIGNED-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 +; ALIGNED-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 +; ALIGNED-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:16 +; ALIGNED-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 +; ALIGNED-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 +; ALIGNED-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 +; ALIGNED-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:32 +; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 +; ALIGNED-NEXT: buffer_load_dword v46, off, 
s[0:3], s32 offset:40 +; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 +; ALIGNED-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 +; ALIGNED-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 +; ALIGNED-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 +; ALIGNED-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 +; ALIGNED-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; ALIGNED-NEXT: s_setpc_b64 s[30:31] ; @@ -1549,39 +1602,41 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4) ; CHECK-NEXT: global_load_dwordx4 v[96:99], v[96:97], off ; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo ; CHECK-NEXT: s_addc_u32 s5, s5, 0 +; CHECK-NEXT: v_add_co_u32 v102, vcc_lo, v100, 48 +; CHECK-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] +; CHECK-NEXT: v_add_co_ci_u32_e64 v103, null, 0, v101, vcc_lo ; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[4:7] offset:240 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[4:7] offset:192 ; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[8:11] offset:224 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[8:11] offset:176 ; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[12:15] offset:208 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[12:15] offset:160 ; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[16:19] offset:192 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[16:19] offset:144 ; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[20:23] offset:176 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[20:23] offset:128 ; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[24:27] offset:160 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[24:27] offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_dwordx4 
v[100:101], v[28:31] offset:144 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[28:31] offset:96 ; CHECK-NEXT: s_waitcnt vmcnt(8) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[32:35] offset:128 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[36:39] offset:112 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[36:39] offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:96 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[48:51] offset:48 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[52:55] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(4) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[68:71] ; CHECK-NEXT: s_waitcnt vmcnt(2) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(1) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[84:87] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99] -; CHECK-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] ; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s6 ; CHECK-NEXT: s_cbranch_vccnz .LBB2_1 ; CHECK-NEXT: ; %bb.2: ; %memcpy-split @@ -1594,27 +1649,31 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4) ; ALIGNED-NEXT: s_mov_b64 s[4:5], 0 ; ALIGNED-NEXT: .LBB2_1: ; %load-store-loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 -; ALIGNED-NEXT: v_add_co_u32 v4, vcc_lo, v2, s4 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v5, null, s5, v3, vcc_lo -; ALIGNED-NEXT: v_add_co_u32 v96, vcc_lo, v0, s4 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v97, null, s5, v1, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v8, vcc_lo, v2, s4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v9, null, s5, v3, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v84, 
vcc_lo, v0, s4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v85, null, s5, v1, vcc_lo ; ALIGNED-NEXT: s_clause 0xf -; ALIGNED-NEXT: global_load_dwordx4 v[98:101], v[4:5], off offset:240 -; ALIGNED-NEXT: global_load_dwordx4 v[84:87], v[4:5], off offset:224 -; ALIGNED-NEXT: global_load_dwordx4 v[80:83], v[4:5], off offset:208 -; ALIGNED-NEXT: global_load_dwordx4 v[68:71], v[4:5], off offset:192 -; ALIGNED-NEXT: global_load_dwordx4 v[64:67], v[4:5], off offset:176 -; ALIGNED-NEXT: global_load_dwordx4 v[52:55], v[4:5], off offset:160 -; ALIGNED-NEXT: global_load_dwordx4 v[48:51], v[4:5], off offset:144 -; ALIGNED-NEXT: global_load_dwordx4 v[36:39], v[4:5], off offset:128 -; ALIGNED-NEXT: global_load_dwordx4 v[32:35], v[4:5], off offset:112 -; ALIGNED-NEXT: global_load_dwordx4 v[28:31], v[4:5], off offset:96 -; ALIGNED-NEXT: global_load_dwordx4 v[24:27], v[4:5], off offset:80 -; ALIGNED-NEXT: global_load_dwordx4 v[20:23], v[4:5], off offset:64 -; ALIGNED-NEXT: global_load_dwordx4 v[16:19], v[4:5], off offset:48 -; ALIGNED-NEXT: global_load_dwordx4 v[12:15], v[4:5], off offset:32 -; ALIGNED-NEXT: global_load_dwordx4 v[8:11], v[4:5], off offset:16 -; ALIGNED-NEXT: global_load_dwordx4 v[4:7], v[4:5], off +; ALIGNED-NEXT: global_load_dwordx4 v[98:101], v[8:9], off offset:240 +; ALIGNED-NEXT: global_load_dwordx4 v[112:115], v[8:9], off offset:224 +; ALIGNED-NEXT: global_load_dwordx4 v[80:83], v[8:9], off offset:208 +; ALIGNED-NEXT: global_load_dwordx4 v[68:71], v[8:9], off offset:192 +; ALIGNED-NEXT: global_load_dwordx4 v[64:67], v[8:9], off offset:176 +; ALIGNED-NEXT: global_load_dwordx4 v[52:55], v[8:9], off offset:160 +; ALIGNED-NEXT: global_load_dwordx4 v[48:51], v[8:9], off offset:144 +; ALIGNED-NEXT: global_load_dwordx4 v[36:39], v[8:9], off offset:128 +; ALIGNED-NEXT: global_load_dwordx4 v[32:35], v[8:9], off offset:112 +; ALIGNED-NEXT: global_load_dwordx4 v[28:31], v[8:9], off offset:96 +; ALIGNED-NEXT: global_load_dwordx4 v[24:27], v[8:9], off offset:80 +; ALIGNED-NEXT: 
global_load_dwordx4 v[20:23], v[8:9], off offset:64 +; ALIGNED-NEXT: global_load_dwordx4 v[16:19], v[8:9], off offset:48 +; ALIGNED-NEXT: global_load_dwordx4 v[12:15], v[8:9], off offset:32 +; ALIGNED-NEXT: global_load_dwordx4 v[4:7], v[8:9], off +; ALIGNED-NEXT: global_load_dwordx4 v[8:11], v[8:9], off offset:16 +; ALIGNED-NEXT: v_add_co_u32 v96, vcc_lo, v84, 6 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v97, null, 0, v85, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v86, vcc_lo, v84, 3 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v87, null, 0, v85, vcc_lo ; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 ; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 ; ALIGNED-NEXT: s_waitcnt vmcnt(15) @@ -1622,466 +1681,465 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4) ; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:172 ; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:164 ; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:160 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v100 offset:250 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v101 offset:254 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:252 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:248 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v99 offset:246 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:244 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v98 offset:242 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:240 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v100 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v100 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v100 offset:244 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v101 offset:248 +; ALIGNED-NEXT: flat_store_byte v[86:87], v101 offset:249 +; ALIGNED-NEXT: flat_store_byte v[86:87], v100 offset:245 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v99 offset:240 +; ALIGNED-NEXT: flat_store_byte v[86:87], v99 offset:241 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v98 offset:236 
+; ALIGNED-NEXT: flat_store_byte v[86:87], v98 offset:237 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v100 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v99 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v99 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v98 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v101 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v100 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v116, 24, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v117, 8, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v118, 24, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v119, 8, v98 +; ALIGNED-NEXT: flat_store_byte v[86:87], v102 offset:248 +; ALIGNED-NEXT: flat_store_byte v[86:87], v100 offset:246 +; ALIGNED-NEXT: flat_store_byte v[86:87], v103 offset:252 ; ALIGNED-NEXT: s_waitcnt vmcnt(14) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v86 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v86 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:251 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v87 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:249 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v87 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:255 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v85 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:253 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v85 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:247 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v84 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:245 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v84 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:243 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v114 +; ALIGNED-NEXT: flat_store_byte v[86:87], v101 offset:250 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v114 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v115 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v115 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 
24, v113 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v113 +; ALIGNED-NEXT: flat_store_byte v[86:87], v116 offset:244 +; ALIGNED-NEXT: flat_store_byte v[86:87], v117 offset:242 +; ALIGNED-NEXT: flat_store_byte v[86:87], v118 offset:240 +; ALIGNED-NEXT: flat_store_byte v[86:87], v119 offset:238 +; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:184 +; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:188 +; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:180 +; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:176 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v114 offset:228 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v115 offset:232 +; ALIGNED-NEXT: flat_store_byte v[86:87], v115 offset:233 +; ALIGNED-NEXT: flat_store_byte v[86:87], v114 offset:229 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v113 offset:224 +; ALIGNED-NEXT: flat_store_byte v[86:87], v113 offset:225 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v112 offset:220 +; ALIGNED-NEXT: flat_store_byte v[86:87], v112 offset:221 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v112 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v112 ; ALIGNED-NEXT: s_waitcnt vmcnt(13) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v82 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:241 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v82 -; ALIGNED-NEXT: buffer_store_dword v86, off, s[0:3], s32 offset:184 -; ALIGNED-NEXT: buffer_store_dword v87, off, s[0:3], s32 offset:188 -; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:180 -; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:176 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v86 offset:234 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v87 offset:238 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:236 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:232 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v85 offset:230 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 
offset:228 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v84 offset:226 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:224 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v83 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v83 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v81 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v81 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:235 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v80 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:233 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v80 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:239 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v82 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v82 +; ALIGNED-NEXT: flat_store_byte v[86:87], v98 offset:232 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v83 +; ALIGNED-NEXT: flat_store_byte v[86:87], v99 offset:230 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v83 +; ALIGNED-NEXT: flat_store_byte v[86:87], v100 offset:236 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v81 +; ALIGNED-NEXT: flat_store_byte v[86:87], v101 offset:234 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v81 +; ALIGNED-NEXT: flat_store_byte v[86:87], v102 offset:228 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v80 +; ALIGNED-NEXT: flat_store_byte v[86:87], v103 offset:226 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v80 +; ALIGNED-NEXT: flat_store_byte v[86:87], v113 offset:224 ; ALIGNED-NEXT: s_waitcnt vmcnt(12) ; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v70 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:237 +; ALIGNED-NEXT: flat_store_byte v[86:87], v112 offset:222 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v70 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:231 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v71 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:229 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v71 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:227 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v69 -; ALIGNED-NEXT: flat_store_byte 
v[96:97], v100 offset:225 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v69 ; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:136 ; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:140 ; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:132 ; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:128 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v82 offset:218 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v83 offset:222 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:220 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:216 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v81 offset:214 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:212 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v80 offset:210 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:208 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v68 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v68 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v82 offset:212 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v83 offset:216 +; ALIGNED-NEXT: flat_store_byte v[86:87], v83 offset:217 +; ALIGNED-NEXT: flat_store_byte v[86:87], v82 offset:213 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v81 offset:208 +; ALIGNED-NEXT: flat_store_byte v[86:87], v81 offset:209 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v80 offset:204 +; ALIGNED-NEXT: flat_store_byte v[86:87], v80 offset:205 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v71 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v71 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v69 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v69 +; ALIGNED-NEXT: flat_store_byte v[86:87], v114 offset:216 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v68 +; ALIGNED-NEXT: flat_store_byte v[86:87], v115 offset:214 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v68 +; ALIGNED-NEXT: flat_store_byte v[86:87], v98 offset:220 ; ALIGNED-NEXT: s_waitcnt vmcnt(11) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v66 -; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v83, 8, v66 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:219 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v67 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:217 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v67 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:223 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v65 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:221 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v65 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:215 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v64 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:213 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v64 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:211 -; ALIGNED-NEXT: s_waitcnt vmcnt(10) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v54 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:209 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v54 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v66 +; ALIGNED-NEXT: flat_store_byte v[86:87], v99 offset:218 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v66 +; ALIGNED-NEXT: flat_store_byte v[86:87], v100 offset:212 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v67 +; ALIGNED-NEXT: flat_store_byte v[86:87], v101 offset:210 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v67 +; ALIGNED-NEXT: flat_store_byte v[86:87], v102 offset:208 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v65 +; ALIGNED-NEXT: flat_store_byte v[86:87], v103 offset:206 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v65 ; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:152 ; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:156 ; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:148 ; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:144 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v70 offset:202 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v71 offset:206 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:204 -; ALIGNED-NEXT: flat_store_byte 
v[96:97], v70 offset:200 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v69 offset:198 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:196 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v68 offset:194 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:192 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v55 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v52 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v52 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v55 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:203 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v53 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:201 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v53 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:207 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v70 offset:196 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v71 offset:200 +; ALIGNED-NEXT: flat_store_byte v[86:87], v71 offset:201 +; ALIGNED-NEXT: flat_store_byte v[86:87], v70 offset:197 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v69 offset:192 +; ALIGNED-NEXT: flat_store_byte v[86:87], v69 offset:193 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v68 offset:188 +; ALIGNED-NEXT: flat_store_byte v[86:87], v68 offset:189 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v64 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v64 +; ALIGNED-NEXT: s_waitcnt vmcnt(10) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v54 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v54 +; ALIGNED-NEXT: flat_store_byte v[86:87], v113 offset:200 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v55 +; ALIGNED-NEXT: flat_store_byte v[86:87], v112 offset:198 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v55 +; ALIGNED-NEXT: flat_store_byte v[86:87], v80 offset:204 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v52 +; ALIGNED-NEXT: flat_store_byte v[86:87], v81 offset:202 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v52 +; ALIGNED-NEXT: flat_store_byte v[86:87], v82 offset:196 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v53 +; 
ALIGNED-NEXT: flat_store_byte v[86:87], v83 offset:194 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v53 +; ALIGNED-NEXT: flat_store_byte v[86:87], v114 offset:192 ; ALIGNED-NEXT: s_waitcnt vmcnt(9) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v50 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:205 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v50 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:199 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v51 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:197 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v51 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:195 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v49 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:193 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v49 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v50 +; ALIGNED-NEXT: flat_store_byte v[86:87], v115 offset:190 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v50 ; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:232 ; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:236 ; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:228 ; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:224 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v66 offset:186 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v67 offset:190 -; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:188 -; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:184 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v65 offset:182 -; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:180 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v64 offset:178 -; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:176 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v48 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v48 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:187 -; ALIGNED-NEXT: s_waitcnt vmcnt(8) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v39 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v38 -; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v67, 8, v38 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:185 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v39 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:191 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v37 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:189 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v37 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:183 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v36 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:181 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v36 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:179 -; ALIGNED-NEXT: s_waitcnt vmcnt(7) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v34 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:177 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v34 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v66 offset:180 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v67 offset:184 +; ALIGNED-NEXT: flat_store_byte v[86:87], v67 offset:185 +; ALIGNED-NEXT: flat_store_byte v[86:87], v66 offset:181 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v65 offset:176 +; ALIGNED-NEXT: flat_store_byte v[86:87], v65 offset:177 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v64 offset:172 +; ALIGNED-NEXT: flat_store_byte v[86:87], v64 offset:173 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v51 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v51 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v49 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v49 +; ALIGNED-NEXT: flat_store_byte v[86:87], v98 offset:184 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v48 +; ALIGNED-NEXT: flat_store_byte v[86:87], v99 offset:182 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v48 +; ALIGNED-NEXT: flat_store_byte v[86:87], v100 offset:188 +; ALIGNED-NEXT: flat_store_byte v[86:87], v101 offset:186 +; ALIGNED-NEXT: flat_store_byte v[86:87], v102 offset:180 +; ALIGNED-NEXT: flat_store_byte v[86:87], v103 offset:178 +; ALIGNED-NEXT: flat_store_byte v[86:87], v68 offset:176 +; 
ALIGNED-NEXT: flat_store_byte v[86:87], v69 offset:174 ; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:252 ; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:248 ; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:244 ; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:240 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v54 offset:170 -; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:168 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v55 offset:174 -; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:172 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v52 offset:162 -; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:160 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v53 offset:166 -; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:164 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v35 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v35 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 24, v33 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v116, 8, v33 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:171 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v32 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:169 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v32 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:173 -; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v31 -; ALIGNED-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:163 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v31 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:161 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v29 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:175 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v30 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v30 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:167 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:165 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v29 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], 
v54 offset:164 +; ALIGNED-NEXT: flat_store_byte v[86:87], v54 offset:165 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v55 offset:168 +; ALIGNED-NEXT: flat_store_byte v[86:87], v55 offset:169 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v52 offset:156 +; ALIGNED-NEXT: flat_store_byte v[86:87], v52 offset:157 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v53 offset:160 +; ALIGNED-NEXT: flat_store_byte v[86:87], v53 offset:161 +; ALIGNED-NEXT: s_waitcnt vmcnt(8) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v36 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v39 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v39 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v37 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v37 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v36 +; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v34 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v34 +; ALIGNED-NEXT: flat_store_byte v[86:87], v70 offset:168 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v35 +; ALIGNED-NEXT: flat_store_byte v[86:87], v71 offset:166 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v35 +; ALIGNED-NEXT: flat_store_byte v[86:87], v113 offset:172 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v33 +; ALIGNED-NEXT: flat_store_byte v[86:87], v112 offset:170 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v33 +; ALIGNED-NEXT: flat_store_byte v[86:87], v80 offset:160 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v32 +; ALIGNED-NEXT: flat_store_byte v[86:87], v81 offset:158 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v32 +; ALIGNED-NEXT: flat_store_byte v[86:87], v82 offset:164 +; ALIGNED-NEXT: flat_store_byte v[86:87], v83 offset:162 ; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:200 ; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:204 ; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:196 ; ALIGNED-NEXT: buffer_store_dword v48, off, 
s[0:3], s32 offset:192 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v50 offset:154 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v51 offset:158 -; ALIGNED-NEXT: flat_store_byte v[96:97], v51 offset:156 -; ALIGNED-NEXT: flat_store_byte v[96:97], v50 offset:152 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v49 offset:150 -; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:148 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v48 offset:146 -; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:144 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v28 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v28 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v50 offset:148 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v51 offset:152 +; ALIGNED-NEXT: flat_store_byte v[86:87], v51 offset:153 +; ALIGNED-NEXT: flat_store_byte v[86:87], v50 offset:149 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v49 offset:144 +; ALIGNED-NEXT: flat_store_byte v[86:87], v49 offset:145 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v48 offset:140 +; ALIGNED-NEXT: flat_store_byte v[86:87], v48 offset:141 +; ALIGNED-NEXT: flat_store_byte v[86:87], v114 offset:152 +; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v28 +; ALIGNED-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] +; ALIGNED-NEXT: flat_store_byte v[86:87], v115 offset:150 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v28 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v31 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v31 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v29 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v29 +; ALIGNED-NEXT: flat_store_byte v[86:87], v64 offset:156 ; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v26 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:155 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:153 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 
offset:159 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:157 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:151 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:149 -; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:147 -; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:145 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v26 +; ALIGNED-NEXT: flat_store_byte v[86:87], v65 offset:154 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v26 +; ALIGNED-NEXT: flat_store_byte v[86:87], v66 offset:148 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v27 +; ALIGNED-NEXT: flat_store_byte v[86:87], v67 offset:146 +; ALIGNED-NEXT: flat_store_byte v[86:87], v98 offset:144 +; ALIGNED-NEXT: flat_store_byte v[86:87], v99 offset:142 ; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:216 ; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:220 ; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:212 ; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:208 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v38 offset:138 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v39 offset:142 -; ALIGNED-NEXT: flat_store_byte v[96:97], v39 offset:140 -; ALIGNED-NEXT: flat_store_byte v[96:97], v38 offset:136 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v37 offset:134 -; ALIGNED-NEXT: flat_store_byte v[96:97], v37 offset:132 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v36 offset:130 -; ALIGNED-NEXT: flat_store_byte v[96:97], v36 offset:128 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:143 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v26 -; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:139 -; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:137 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:141 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:135 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:133 -; ALIGNED-NEXT: 
flat_store_byte v[96:97], v84 offset:131 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:129 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v38 offset:132 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v39 offset:136 +; ALIGNED-NEXT: flat_store_byte v[86:87], v39 offset:137 +; ALIGNED-NEXT: flat_store_byte v[86:87], v38 offset:133 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v37 offset:128 +; ALIGNED-NEXT: flat_store_byte v[86:87], v37 offset:129 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v36 offset:124 +; ALIGNED-NEXT: flat_store_byte v[84:85], v36 offset:128 +; ALIGNED-NEXT: flat_store_byte v[86:87], v52 offset:128 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v27 +; ALIGNED-NEXT: flat_store_byte v[86:87], v100 offset:136 +; ALIGNED-NEXT: flat_store_byte v[86:87], v101 offset:134 +; ALIGNED-NEXT: flat_store_byte v[86:87], v102 offset:140 +; ALIGNED-NEXT: flat_store_byte v[86:87], v103 offset:138 +; ALIGNED-NEXT: flat_store_byte v[86:87], v68 offset:132 +; ALIGNED-NEXT: flat_store_byte v[86:87], v69 offset:130 +; ALIGNED-NEXT: flat_store_byte v[86:87], v53 offset:126 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v18 ; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:40 ; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:44 ; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:36 ; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:32 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v34 offset:122 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v35 offset:126 -; ALIGNED-NEXT: flat_store_byte v[96:97], v35 offset:124 -; ALIGNED-NEXT: flat_store_byte v[96:97], v34 offset:120 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v33 offset:118 -; ALIGNED-NEXT: flat_store_byte v[96:97], v33 offset:116 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v32 offset:114 -; ALIGNED-NEXT: flat_store_byte v[96:97], v32 offset:112 -; 
ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v14 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:123 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:121 -; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:127 -; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:125 -; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:119 -; ALIGNED-NEXT: flat_store_byte v[96:97], v116 offset:117 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:115 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v10 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:113 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v34 offset:116 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v35 offset:120 +; ALIGNED-NEXT: flat_store_byte v[86:87], v35 offset:121 +; ALIGNED-NEXT: flat_store_byte v[86:87], v34 offset:117 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v33 offset:112 +; ALIGNED-NEXT: flat_store_byte v[86:87], v33 offset:113 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v32 offset:108 +; ALIGNED-NEXT: flat_store_byte v[86:87], v32 offset:109 +; ALIGNED-NEXT: flat_store_byte v[86:87], v54 offset:120 +; ALIGNED-NEXT: flat_store_byte v[86:87], v55 offset:118 +; ALIGNED-NEXT: flat_store_byte v[86:87], v70 offset:124 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v14 +; ALIGNED-NEXT: flat_store_byte v[86:87], v71 offset:122 +; ALIGNED-NEXT: flat_store_byte v[86:87], v113 offset:116 +; ALIGNED-NEXT: flat_store_byte v[86:87], v112 offset:114 +; ALIGNED-NEXT: flat_store_byte v[86:87], v80 offset:112 +; ALIGNED-NEXT: flat_store_byte v[86:87], v81 offset:110 ; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:56 ; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:60 ; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:52 ; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:48 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 
offset:106 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110 -; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108 -; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102 -; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98 -; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:111 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v6 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v27 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v18 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v14 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v10 -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:109 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v6 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:100 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:104 +; ALIGNED-NEXT: flat_store_byte v[86:87], v31 offset:105 +; ALIGNED-NEXT: flat_store_byte v[86:87], v30 offset:101 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:96 +; ALIGNED-NEXT: flat_store_byte v[86:87], v29 offset:97 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:92 +; ALIGNED-NEXT: flat_store_byte v[86:87], v28 offset:93 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v10 +; ALIGNED-NEXT: flat_store_byte v[86:87], v114 offset:96 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v6 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v25 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v14 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v10 +; ALIGNED-NEXT: flat_store_byte v[86:87], v115 offset:94 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v6 ; ALIGNED-NEXT: s_and_b32 vcc_lo, exec_lo, s6 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v27 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v25 -; 
ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v19 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v15 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v11 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:103 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v7 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v22 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v22 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v23 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v23 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 24, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v20 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v20 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v19 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v17 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v17 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v16 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v16 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v15 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v13 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v13 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 24, v12 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 8, v12 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v11 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v9 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v9 -; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:107 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v8 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:105 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v8 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v7 -; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:99 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v5 -; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:97 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v5 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v25 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v19 
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v17 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v15 +; ALIGNED-NEXT: flat_store_byte v[86:87], v82 offset:104 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v11 +; ALIGNED-NEXT: flat_store_byte v[86:87], v83 offset:102 +; ALIGNED-NEXT: flat_store_byte v[86:87], v48 offset:108 +; ALIGNED-NEXT: flat_store_byte v[86:87], v49 offset:106 +; ALIGNED-NEXT: flat_store_byte v[86:87], v50 offset:100 +; ALIGNED-NEXT: flat_store_byte v[86:87], v51 offset:98 ; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:8 ; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:12 ; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:4 ; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v26 offset:90 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v27 offset:94 -; ALIGNED-NEXT: flat_store_byte v[96:97], v27 offset:92 -; ALIGNED-NEXT: flat_store_byte v[96:97], v26 offset:88 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v25 offset:86 -; ALIGNED-NEXT: flat_store_byte v[96:97], v25 offset:84 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v24 offset:82 -; ALIGNED-NEXT: flat_store_byte v[96:97], v24 offset:80 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v4 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v4 -; ALIGNED-NEXT: flat_store_byte v[96:97], v50 offset:91 -; ALIGNED-NEXT: flat_store_byte v[96:97], v51 offset:89 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:95 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:93 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:87 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:85 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:83 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:81 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v26 offset:84 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v27 
offset:88 +; ALIGNED-NEXT: flat_store_byte v[86:87], v27 offset:89 +; ALIGNED-NEXT: flat_store_byte v[86:87], v26 offset:85 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v25 offset:80 +; ALIGNED-NEXT: flat_store_byte v[86:87], v25 offset:81 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v24 offset:76 +; ALIGNED-NEXT: flat_store_byte v[86:87], v24 offset:77 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v7 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 24, v22 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v22 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v17 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v15 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v13 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v13 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v11 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v9 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v9 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v7 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 24, v5 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 8, v5 +; ALIGNED-NEXT: flat_store_byte v[86:87], v64 offset:88 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v4 +; ALIGNED-NEXT: flat_store_byte v[86:87], v65 offset:86 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v4 +; ALIGNED-NEXT: flat_store_byte v[86:87], v66 offset:92 +; ALIGNED-NEXT: flat_store_byte v[86:87], v67 offset:90 +; ALIGNED-NEXT: flat_store_byte v[86:87], v98 offset:84 +; ALIGNED-NEXT: flat_store_byte v[86:87], v99 offset:82 +; 
ALIGNED-NEXT: flat_store_byte v[86:87], v36 offset:80 +; ALIGNED-NEXT: flat_store_byte v[86:87], v37 offset:78 ; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:24 ; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:28 ; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:20 ; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:16 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v22 offset:74 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v23 offset:78 -; ALIGNED-NEXT: flat_store_byte v[96:97], v23 offset:76 -; ALIGNED-NEXT: flat_store_byte v[96:97], v22 offset:72 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v21 offset:70 -; ALIGNED-NEXT: flat_store_byte v[96:97], v21 offset:68 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v20 offset:66 -; ALIGNED-NEXT: flat_store_byte v[96:97], v20 offset:64 -; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:75 -; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:73 -; ALIGNED-NEXT: flat_store_byte v[96:97], v36 offset:79 -; ALIGNED-NEXT: flat_store_byte v[96:97], v37 offset:77 -; ALIGNED-NEXT: flat_store_byte v[96:97], v38 offset:71 -; ALIGNED-NEXT: flat_store_byte v[96:97], v39 offset:69 -; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:67 -; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:65 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v22 offset:68 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v23 offset:72 +; ALIGNED-NEXT: flat_store_byte v[86:87], v23 offset:73 +; ALIGNED-NEXT: flat_store_byte v[86:87], v22 offset:69 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v21 offset:64 +; ALIGNED-NEXT: flat_store_byte v[86:87], v21 offset:65 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v20 offset:60 +; ALIGNED-NEXT: flat_store_byte v[84:85], v20 offset:64 +; ALIGNED-NEXT: flat_store_byte v[86:87], v38 offset:72 +; ALIGNED-NEXT: flat_store_byte v[86:87], v39 offset:70 +; ALIGNED-NEXT: flat_store_byte v[86:87], v100 offset:76 +; ALIGNED-NEXT: 
flat_store_byte v[86:87], v101 offset:74 +; ALIGNED-NEXT: flat_store_byte v[86:87], v102 offset:68 +; ALIGNED-NEXT: flat_store_byte v[86:87], v103 offset:66 +; ALIGNED-NEXT: flat_store_byte v[86:87], v68 offset:64 +; ALIGNED-NEXT: flat_store_byte v[86:87], v69 offset:62 ; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:59 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v18 offset:58 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:57 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v19 offset:62 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:63 -; ALIGNED-NEXT: flat_store_byte v[96:97], v19 offset:60 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:61 -; ALIGNED-NEXT: flat_store_byte v[96:97], v18 offset:56 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:55 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v17 offset:54 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:53 -; ALIGNED-NEXT: flat_store_byte v[96:97], v17 offset:52 -; ALIGNED-NEXT: flat_store_byte v[96:97], v32 offset:51 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v16 offset:50 -; ALIGNED-NEXT: flat_store_byte v[96:97], v33 offset:49 -; ALIGNED-NEXT: flat_store_byte v[96:97], v16 offset:48 +; ALIGNED-NEXT: flat_store_byte v[86:87], v52 offset:56 +; ALIGNED-NEXT: flat_store_byte v[86:87], v53 offset:54 +; ALIGNED-NEXT: flat_store_byte v[86:87], v32 offset:60 +; ALIGNED-NEXT: flat_store_byte v[86:87], v33 offset:58 +; ALIGNED-NEXT: flat_store_byte v[86:87], v34 offset:52 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v18 offset:52 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v19 offset:56 +; ALIGNED-NEXT: flat_store_byte v[86:87], v19 offset:57 +; ALIGNED-NEXT: flat_store_byte v[86:87], v18 
offset:53 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v17 offset:48 +; ALIGNED-NEXT: flat_store_byte v[86:87], v35 offset:50 +; ALIGNED-NEXT: flat_store_byte v[86:87], v17 offset:49 +; ALIGNED-NEXT: flat_store_byte v[86:87], v54 offset:48 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v16 offset:44 +; ALIGNED-NEXT: flat_store_byte v[86:87], v55 offset:46 +; ALIGNED-NEXT: flat_store_byte v[86:87], v16 offset:45 ; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:124 ; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:116 ; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:112 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v14 offset:42 -; ALIGNED-NEXT: flat_store_byte v[96:97], v34 offset:43 -; ALIGNED-NEXT: flat_store_byte v[96:97], v35 offset:41 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v15 offset:46 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:47 -; ALIGNED-NEXT: flat_store_byte v[96:97], v15 offset:44 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:45 -; ALIGNED-NEXT: flat_store_byte v[96:97], v14 offset:40 -; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:39 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v13 offset:38 -; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:37 -; ALIGNED-NEXT: flat_store_byte v[96:97], v13 offset:36 -; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:35 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v12 offset:34 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:33 -; ALIGNED-NEXT: flat_store_byte v[96:97], v12 offset:32 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v14 offset:36 +; ALIGNED-NEXT: flat_store_byte v[86:87], v70 offset:40 +; ALIGNED-NEXT: flat_store_byte v[86:87], v71 offset:38 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v15 offset:40 +; ALIGNED-NEXT: flat_store_byte v[86:87], v113 offset:44 +; ALIGNED-NEXT: flat_store_byte v[86:87], v15 offset:41 +; 
ALIGNED-NEXT: flat_store_byte v[86:87], v112 offset:42 +; ALIGNED-NEXT: flat_store_byte v[86:87], v14 offset:37 +; ALIGNED-NEXT: flat_store_byte v[86:87], v80 offset:36 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v13 offset:32 +; ALIGNED-NEXT: flat_store_byte v[86:87], v81 offset:34 +; ALIGNED-NEXT: flat_store_byte v[86:87], v13 offset:33 +; ALIGNED-NEXT: flat_store_byte v[86:87], v28 offset:32 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v12 offset:28 +; ALIGNED-NEXT: flat_store_byte v[86:87], v29 offset:30 +; ALIGNED-NEXT: flat_store_byte v[84:85], v12 offset:32 ; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:72 ; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:76 ; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:68 ; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:64 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v10 offset:26 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:27 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:25 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v11 offset:30 -; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:31 -; ALIGNED-NEXT: flat_store_byte v[96:97], v11 offset:28 -; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:29 -; ALIGNED-NEXT: flat_store_byte v[96:97], v10 offset:24 -; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:23 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v9 offset:22 -; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:21 -; ALIGNED-NEXT: flat_store_byte v[96:97], v9 offset:20 -; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:19 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v8 offset:18 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:17 -; ALIGNED-NEXT: flat_store_byte v[96:97], v8 offset:16 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v10 offset:20 +; ALIGNED-NEXT: flat_store_byte v[86:87], v30 offset:24 +; ALIGNED-NEXT: flat_store_byte v[86:87], v31 offset:22 +; ALIGNED-NEXT: 
flat_store_byte_d16_hi v[96:97], v11 offset:24 +; ALIGNED-NEXT: flat_store_byte v[86:87], v82 offset:28 +; ALIGNED-NEXT: flat_store_byte v[86:87], v11 offset:25 +; ALIGNED-NEXT: flat_store_byte v[86:87], v83 offset:26 +; ALIGNED-NEXT: flat_store_byte v[86:87], v10 offset:21 +; ALIGNED-NEXT: flat_store_byte v[86:87], v48 offset:20 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v9 offset:16 +; ALIGNED-NEXT: flat_store_byte v[86:87], v49 offset:18 +; ALIGNED-NEXT: flat_store_byte v[86:87], v9 offset:17 +; ALIGNED-NEXT: flat_store_byte v[86:87], v50 offset:16 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v8 offset:12 +; ALIGNED-NEXT: flat_store_byte v[86:87], v51 offset:14 +; ALIGNED-NEXT: flat_store_byte v[84:85], v8 offset:16 ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:88 ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:92 ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:84 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v6 offset:10 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:11 -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:9 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v7 offset:14 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:15 -; ALIGNED-NEXT: flat_store_byte v[96:97], v7 offset:12 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:13 -; ALIGNED-NEXT: flat_store_byte v[96:97], v6 offset:8 -; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:7 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v5 offset:6 -; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:5 -; ALIGNED-NEXT: flat_store_byte v[96:97], v5 offset:4 -; ALIGNED-NEXT: flat_store_byte v[96:97], v24 offset:3 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v4 offset:2 -; ALIGNED-NEXT: flat_store_byte v[96:97], v25 offset:1 -; ALIGNED-NEXT: flat_store_byte v[96:97], v4 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v6 offset:4 +; ALIGNED-NEXT: 
flat_store_byte v[86:87], v114 offset:8 +; ALIGNED-NEXT: flat_store_byte v[86:87], v115 offset:6 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v7 offset:8 +; ALIGNED-NEXT: flat_store_byte v[86:87], v24 offset:12 +; ALIGNED-NEXT: flat_store_byte v[86:87], v7 offset:9 +; ALIGNED-NEXT: flat_store_byte v[86:87], v25 offset:10 +; ALIGNED-NEXT: flat_store_byte v[84:85], v6 offset:8 +; ALIGNED-NEXT: flat_store_byte v[86:87], v26 offset:4 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v5 +; ALIGNED-NEXT: flat_store_byte v[86:87], v27 offset:2 +; ALIGNED-NEXT: flat_store_byte v[84:85], v5 offset:4 +; ALIGNED-NEXT: flat_store_byte v[86:87], v64 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v4 offset:2 +; ALIGNED-NEXT: flat_store_byte v[84:85], v65 offset:1 +; ALIGNED-NEXT: flat_store_byte v[84:85], v4 ; ALIGNED-NEXT: s_cbranch_vccnz .LBB2_1 ; ALIGNED-NEXT: ; %bb.2: ; %memcpy-split ; ALIGNED-NEXT: s_waitcnt lgkmcnt(0) @@ -3599,38 +3657,38 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; CHECK-NEXT: buffer_load_dword v13, v2, s[0:3], 0 offen offset:72 ; CHECK-NEXT: buffer_load_dword v12, v2, s[0:3], 0 offen offset:68 ; CHECK-NEXT: buffer_load_dword v11, v2, s[0:3], 0 offen offset:64 -; CHECK-NEXT: buffer_load_dword v22, v2, s[0:3], 0 offen offset:108 -; CHECK-NEXT: buffer_load_dword v26, v2, s[0:3], 0 offen offset:124 -; CHECK-NEXT: buffer_load_dword v25, v2, s[0:3], 0 offen offset:120 -; CHECK-NEXT: buffer_load_dword v24, v2, s[0:3], 0 offen offset:116 -; CHECK-NEXT: buffer_load_dword v23, v2, s[0:3], 0 offen offset:112 -; CHECK-NEXT: buffer_load_dword v21, v2, s[0:3], 0 offen offset:104 -; CHECK-NEXT: buffer_load_dword v20, v2, s[0:3], 0 offen offset:100 -; CHECK-NEXT: buffer_load_dword v19, v2, s[0:3], 0 offen offset:96 -; CHECK-NEXT: buffer_load_dword v30, v2, s[0:3], 0 offen offset:236 -; CHECK-NEXT: buffer_load_dword v34, v2, s[0:3], 0 offen offset:252 -; CHECK-NEXT: buffer_load_dword v33, v2, s[0:3], 0 offen offset:248 -; 
CHECK-NEXT: buffer_load_dword v32, v2, s[0:3], 0 offen offset:244 -; CHECK-NEXT: buffer_load_dword v31, v2, s[0:3], 0 offen offset:240 -; CHECK-NEXT: buffer_load_dword v29, v2, s[0:3], 0 offen offset:232 -; CHECK-NEXT: buffer_load_dword v28, v2, s[0:3], 0 offen offset:228 -; CHECK-NEXT: buffer_load_dword v27, v2, s[0:3], 0 offen offset:224 -; CHECK-NEXT: buffer_load_dword v38, v2, s[0:3], 0 offen offset:220 -; CHECK-NEXT: buffer_load_dword v37, v2, s[0:3], 0 offen offset:216 -; CHECK-NEXT: buffer_load_dword v36, v2, s[0:3], 0 offen offset:212 -; CHECK-NEXT: buffer_load_dword v35, v2, s[0:3], 0 offen offset:208 -; CHECK-NEXT: buffer_load_dword v51, v2, s[0:3], 0 offen offset:204 -; CHECK-NEXT: buffer_load_dword v50, v2, s[0:3], 0 offen offset:200 -; CHECK-NEXT: buffer_load_dword v49, v2, s[0:3], 0 offen offset:196 -; CHECK-NEXT: buffer_load_dword v48, v2, s[0:3], 0 offen offset:192 -; CHECK-NEXT: buffer_load_dword v55, v2, s[0:3], 0 offen offset:172 -; CHECK-NEXT: buffer_load_dword v67, v2, s[0:3], 0 offen offset:188 -; CHECK-NEXT: buffer_load_dword v66, v2, s[0:3], 0 offen offset:184 -; CHECK-NEXT: buffer_load_dword v65, v2, s[0:3], 0 offen offset:180 -; CHECK-NEXT: buffer_load_dword v64, v2, s[0:3], 0 offen offset:176 -; CHECK-NEXT: buffer_load_dword v54, v2, s[0:3], 0 offen offset:168 -; CHECK-NEXT: buffer_load_dword v53, v2, s[0:3], 0 offen offset:164 -; CHECK-NEXT: buffer_load_dword v52, v2, s[0:3], 0 offen offset:160 +; CHECK-NEXT: buffer_load_dword v22, v2, s[0:3], 0 offen offset:236 +; CHECK-NEXT: buffer_load_dword v26, v2, s[0:3], 0 offen offset:252 +; CHECK-NEXT: buffer_load_dword v25, v2, s[0:3], 0 offen offset:248 +; CHECK-NEXT: buffer_load_dword v24, v2, s[0:3], 0 offen offset:244 +; CHECK-NEXT: buffer_load_dword v23, v2, s[0:3], 0 offen offset:240 +; CHECK-NEXT: buffer_load_dword v21, v2, s[0:3], 0 offen offset:232 +; CHECK-NEXT: buffer_load_dword v20, v2, s[0:3], 0 offen offset:228 +; CHECK-NEXT: buffer_load_dword v19, v2, s[0:3], 0 offen offset:224 
+; CHECK-NEXT: buffer_load_dword v30, v2, s[0:3], 0 offen offset:220 +; CHECK-NEXT: buffer_load_dword v29, v2, s[0:3], 0 offen offset:216 +; CHECK-NEXT: buffer_load_dword v28, v2, s[0:3], 0 offen offset:212 +; CHECK-NEXT: buffer_load_dword v27, v2, s[0:3], 0 offen offset:208 +; CHECK-NEXT: buffer_load_dword v34, v2, s[0:3], 0 offen offset:204 +; CHECK-NEXT: buffer_load_dword v33, v2, s[0:3], 0 offen offset:200 +; CHECK-NEXT: buffer_load_dword v32, v2, s[0:3], 0 offen offset:196 +; CHECK-NEXT: buffer_load_dword v31, v2, s[0:3], 0 offen offset:192 +; CHECK-NEXT: buffer_load_dword v38, v2, s[0:3], 0 offen offset:108 +; CHECK-NEXT: buffer_load_dword v51, v2, s[0:3], 0 offen offset:124 +; CHECK-NEXT: buffer_load_dword v50, v2, s[0:3], 0 offen offset:120 +; CHECK-NEXT: buffer_load_dword v49, v2, s[0:3], 0 offen offset:116 +; CHECK-NEXT: buffer_load_dword v48, v2, s[0:3], 0 offen offset:112 +; CHECK-NEXT: buffer_load_dword v37, v2, s[0:3], 0 offen offset:104 +; CHECK-NEXT: buffer_load_dword v36, v2, s[0:3], 0 offen offset:100 +; CHECK-NEXT: buffer_load_dword v35, v2, s[0:3], 0 offen offset:96 +; CHECK-NEXT: buffer_load_dword v55, v2, s[0:3], 0 offen offset:188 +; CHECK-NEXT: buffer_load_dword v54, v2, s[0:3], 0 offen offset:184 +; CHECK-NEXT: buffer_load_dword v53, v2, s[0:3], 0 offen offset:180 +; CHECK-NEXT: buffer_load_dword v52, v2, s[0:3], 0 offen offset:176 +; CHECK-NEXT: buffer_load_dword v67, v2, s[0:3], 0 offen offset:172 +; CHECK-NEXT: buffer_load_dword v66, v2, s[0:3], 0 offen offset:168 +; CHECK-NEXT: buffer_load_dword v65, v2, s[0:3], 0 offen offset:164 +; CHECK-NEXT: buffer_load_dword v64, v2, s[0:3], 0 offen offset:160 ; CHECK-NEXT: buffer_load_dword v71, v2, s[0:3], 0 offen offset:156 ; CHECK-NEXT: buffer_load_dword v70, v2, s[0:3], 0 offen offset:152 ; CHECK-NEXT: buffer_load_dword v69, v2, s[0:3], 0 offen offset:148 @@ -3651,29 +3709,31 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; CHECK-NEXT: s_add_u32 s4, s4, 
0x100 ; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo ; CHECK-NEXT: s_addc_u32 s5, s5, 0 -; CHECK-NEXT: v_add_nc_u32_e32 v2, 0x100, v2 +; CHECK-NEXT: v_add_co_u32 v102, vcc_lo, v100, 48 ; CHECK-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] -; CHECK-NEXT: s_waitcnt vmcnt(35) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[31:34] offset:240 +; CHECK-NEXT: v_add_nc_u32_e32 v2, 0x100, v2 +; CHECK-NEXT: v_add_co_ci_u32_e64 v103, null, 0, v101, vcc_lo +; CHECK-NEXT: s_waitcnt vmcnt(43) +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[23:26] offset:192 +; CHECK-NEXT: s_waitcnt vmcnt(40) +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[19:22] offset:176 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[27:30] offset:160 ; CHECK-NEXT: s_waitcnt vmcnt(32) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[27:30] offset:224 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[35:38] offset:208 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:192 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:176 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[31:34] offset:144 +; CHECK-NEXT: s_waitcnt vmcnt(20) +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[52:55] offset:128 ; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:160 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[64:67] offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:144 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[68:71] offset:96 ; CHECK-NEXT: s_waitcnt vmcnt(8) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:128 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[23:26] offset:112 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[19:22] offset:96 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[15:18] offset:80 +; CHECK-NEXT: flat_store_dwordx4 
v[102:103], v[48:51] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[35:38] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[15:18] offset:32 ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[11:14] offset:64 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[7:10] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[7:10] ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[3:6] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(1) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99] offset:16 @@ -3756,21 +3816,21 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:31 ; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:32 ; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:33 -; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:34 +; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:34 ; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:27 -; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:35 +; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:35 ; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:36 ; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:37 -; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:38 -; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:39 +; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:38 +; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:39 ; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:40 -; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:41 -; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:42 +; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:41 +; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:42 ; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:43 ; 
ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:44 -; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:45 -; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:46 -; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:47 +; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:45 +; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:46 +; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:47 ; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:48 ; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:49 ; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:50 @@ -3779,12 +3839,12 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:53 ; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:54 ; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:55 -; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:56 +; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:56 ; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:57 ; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:58 -; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:60 +; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:60 ; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:61 -; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:62 +; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:62 ; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:63 ; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:64 ; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:65 @@ -3835,20 +3895,20 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, v6 ; 
ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v12 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v15, 8, v14 -; ALIGNED-NEXT: v_lshl_or_b32 v8, v19, 8, v17 +; ALIGNED-NEXT: v_lshl_or_b32 v8, v20, 8, v19 ; ALIGNED-NEXT: s_waitcnt vmcnt(39) ; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 8, v13 ; ALIGNED-NEXT: s_waitcnt vmcnt(37) -; ALIGNED-NEXT: v_lshl_or_b32 v10, v20, 8, v18 +; ALIGNED-NEXT: v_lshl_or_b32 v10, v18, 8, v17 ; ALIGNED-NEXT: s_waitcnt vmcnt(35) -; ALIGNED-NEXT: v_lshl_or_b32 v11, v23, 8, v22 +; ALIGNED-NEXT: v_lshl_or_b32 v11, v26, 8, v22 ; ALIGNED-NEXT: s_waitcnt vmcnt(33) -; ALIGNED-NEXT: v_lshl_or_b32 v12, v28, 8, v25 +; ALIGNED-NEXT: v_lshl_or_b32 v12, v28, 8, v27 ; ALIGNED-NEXT: s_waitcnt vmcnt(31) -; ALIGNED-NEXT: v_lshl_or_b32 v13, v24, 8, v21 +; ALIGNED-NEXT: v_lshl_or_b32 v13, v23, 8, v21 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_waitcnt vmcnt(29) -; ALIGNED-NEXT: v_lshl_or_b32 v14, v27, 8, v26 +; ALIGNED-NEXT: v_lshl_or_b32 v14, v25, 8, v24 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v6, 16, v5 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v8, 16, v7 @@ -3861,20 +3921,20 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(25) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v34, 8, v33 -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(23) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v37, 8, v32 -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(21) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v36, 8, v35 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; 
ALIGNED-NEXT: s_waitcnt vmcnt(16) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v50, 8, v38 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v50, 8, v39 ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(14) -; ALIGNED-NEXT: v_lshl_or_b32 v5, v49, 8, v39 +; ALIGNED-NEXT: v_lshl_or_b32 v5, v49, 8, v48 ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v6, v51, 8, v48 +; ALIGNED-NEXT: v_lshl_or_b32 v6, v51, 8, v38 ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(10) ; ALIGNED-NEXT: v_lshl_or_b32 v7, v53, 8, v52 @@ -3898,7 +3958,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:86 ; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:82 -; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(6) @@ -3909,22 +3969,22 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(5) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v80, 8, v71 -; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v20, 
off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:73 ; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:72 ; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill @@ -3933,11 +3993,11 @@ define void 
@memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill @@ -4205,7 +4265,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1120 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1124 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1128 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(3) @@ -4236,17 +4296,17 @@ define void 
@memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:147 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:148 -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1108 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1104 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:145 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1064 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:144 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1100 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1088 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1104 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1100 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1084 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1080 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1072 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 @@ -4261,302 +4321,309 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1148 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:156 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1160 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1156 ; 4-byte Folded Spill ; 
ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1152 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1144 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:159 +; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:159 ; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:155 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v127, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v125, 8, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:152 -; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:153 -; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:154 +; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:152 +; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:153 +; ALIGNED-NEXT: buffer_load_ubyte v110, v2, s[0:3], 0 offen offset:154 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v123, 8, v125 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v120, 8, v123 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v111 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v110 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v110, v2, s[0:3], 0 offen offset:160 -; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:161 -; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:162 -; ALIGNED-NEXT: buffer_load_ubyte v93, v2, s[0:3], 0 offen offset:163 -; ALIGNED-NEXT: 
buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:164 +; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:160 +; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:161 +; ALIGNED-NEXT: buffer_load_ubyte v93, v2, s[0:3], 0 offen offset:162 +; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:163 +; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:164 ; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:165 -; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:166 -; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:167 +; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:166 +; ALIGNED-NEXT: buffer_load_ubyte v91, v2, s[0:3], 0 offen offset:167 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v105, 8, v110 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v104, 8, v108 ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v93, 8, v94 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v92, 8, v93 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v92, 8, v104 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v91, 8, v94 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v95, 8, v108 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v95, 8, v105 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1196 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:172 -; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:173 -; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:174 -; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:175 -; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:171 +; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:172 +; ALIGNED-NEXT: 
buffer_load_ubyte v77, v2, s[0:3], 0 offen offset:173 +; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:174 +; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:175 +; ALIGNED-NEXT: buffer_load_ubyte v74, v2, s[0:3], 0 offen offset:171 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v79, 8, v90 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 8, v78 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v78, 8, v88 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v75, 8, v76 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:168 -; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:169 -; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:170 +; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:168 +; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:169 +; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:170 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v76 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v63, 8, v73 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v75, 8, v63 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v74, 8, v62 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:176 -; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:177 -; ALIGNED-NEXT: buffer_load_ubyte v57, v2, s[0:3], 0 offen offset:178 +; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:176 +; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:177 +; ALIGNED-NEXT: buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:178 ; ALIGNED-NEXT: buffer_load_ubyte 
v56, v2, s[0:3], 0 offen offset:179 -; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:180 -; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:181 +; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:180 +; ALIGNED-NEXT: buffer_load_ubyte v57, v2, s[0:3], 0 offen offset:181 ; ALIGNED-NEXT: buffer_load_ubyte v58, v2, s[0:3], 0 offen offset:182 -; ALIGNED-NEXT: buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:183 +; ALIGNED-NEXT: buffer_load_ubyte v46, v2, s[0:3], 0 offen offset:183 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v60, 8, v62 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v59, 8, v61 ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v56, 8, v57 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v56, 8, v47 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v47, 8, v58 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v46, 8, v58 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v59, 8, v61 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v57, 8, v60 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:188 +; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:188 ; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:189 -; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:190 -; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:191 +; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:190 +; ALIGNED-NEXT: buffer_load_ubyte v41, v2, s[0:3], 0 offen offset:191 ; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:187 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v45 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 
8, v44 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v42, 8, v44 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v41, 8, v42 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1216 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v41, v2, s[0:3], 0 offen offset:184 -; ALIGNED-NEXT: buffer_load_ubyte v119, v2, s[0:3], 0 offen offset:185 -; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:186 +; ALIGNED-NEXT: buffer_load_ubyte v119, v2, s[0:3], 0 offen offset:184 +; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:185 +; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:186 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v119, 8, v41 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v118, 8, v119 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v118 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v116 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x3e -; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:192 -; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:193 +; ALIGNED-NEXT: s_clause 0x3c +; ALIGNED-NEXT: buffer_load_ubyte v115, v2, s[0:3], 0 offen offset:192 +; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:193 ; ALIGNED-NEXT: buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:194 -; ALIGNED-NEXT: buffer_load_ubyte v102, v2, s[0:3], 0 offen offset:195 -; ALIGNED-NEXT: buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:196 +; ALIGNED-NEXT: buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:195 +; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:196 ; ALIGNED-NEXT: buffer_load_ubyte v103, v2, s[0:3], 0 offen offset:197 -; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:198 -; ALIGNED-NEXT: 
buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:199 -; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:204 -; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:205 +; ALIGNED-NEXT: buffer_load_ubyte v102, v2, s[0:3], 0 offen offset:198 +; ALIGNED-NEXT: buffer_load_ubyte v99, v2, s[0:3], 0 offen offset:199 +; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:204 +; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:205 ; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:206 -; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:207 +; ALIGNED-NEXT: buffer_load_ubyte v86, v2, s[0:3], 0 offen offset:207 ; ALIGNED-NEXT: buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:203 -; ALIGNED-NEXT: buffer_load_ubyte v86, v2, s[0:3], 0 offen offset:200 -; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:201 -; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:202 -; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:212 -; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:213 -; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:214 -; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:215 -; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:211 -; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:216 -; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:217 +; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:200 +; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:201 +; ALIGNED-NEXT: buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:202 +; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:212 +; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:213 +; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:214 +; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:215 +; ALIGNED-NEXT: 
buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:211 +; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:216 +; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:217 ; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:218 -; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:219 -; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:220 +; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:219 +; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:220 ; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:221 -; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:222 -; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:223 -; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:208 -; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:209 -; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:210 -; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:224 -; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:225 -; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:226 +; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:222 +; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:223 +; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:208 +; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:209 +; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:210 +; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:224 +; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:225 +; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:226 ; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:227 -; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:228 -; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: 
buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:228 +; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:229 ; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:230 -; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:231 -; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:236 -; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:237 +; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:231 +; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:237 ; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:238 ; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:239 ; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:235 ; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:232 ; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:233 -; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:234 -; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:240 -; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:241 +; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:234 +; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:240 +; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:241 ; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:242 -; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:243 -; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:244 +; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:243 +; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:244 ; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:245 -; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:246 -; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:247 -; ALIGNED-NEXT: buffer_load_ubyte 
v12, v2, s[0:3], 0 offen offset:252 +; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:246 +; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:247 +; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:252 ; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:253 -; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:254 -; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:255 -; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:251 +; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:254 +; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:255 +; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:251 +; ALIGNED-NEXT: s_waitcnt vmcnt(59) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v112, 8, v115 +; ALIGNED-NEXT: s_waitcnt vmcnt(57) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v100, 8, v101 +; ALIGNED-NEXT: s_waitcnt vmcnt(51) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v87, 8, v97 +; ALIGNED-NEXT: s_waitcnt vmcnt(49) +; ALIGNED-NEXT: v_lshl_or_b32 v4, v86, 8, v96 +; ALIGNED-NEXT: v_lshl_or_b32 v107, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v103, 8, v113 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v99, 8, v102 +; ALIGNED-NEXT: v_lshl_or_b32 v79, v4, 16, v3 +; ALIGNED-NEXT: s_waitcnt vmcnt(46) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v83, 8, v84 +; ALIGNED-NEXT: s_waitcnt vmcnt(45) +; ALIGNED-NEXT: v_lshl_or_b32 v4, v85, 8, v82 +; ALIGNED-NEXT: v_lshl_or_b32 v88, v1, 16, v0 +; ALIGNED-NEXT: s_clause 0x2 ; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:248 -; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:250 +; ALIGNED-NEXT: s_waitcnt vmcnt(8) +; ALIGNED-NEXT: v_lshl_or_b32 v5, v14, 8, v18 +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: v_lshl_or_b32 v6, v10, 8, v12 +; ALIGNED-NEXT: v_lshl_or_b32 v72, 
v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v67, 8, v80 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v64, 8, v70 ; ALIGNED-NEXT: s_clause 0x6 -; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:250 ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen ; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:2 -; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:4 +; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:3 +; ALIGNED-NEXT: buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:4 ; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:5 -; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:6 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:7 -; ALIGNED-NEXT: s_waitcnt vmcnt(62) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v113, 8, v116 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v102, 8, v101 -; ALIGNED-NEXT: v_lshl_or_b32 v106, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v103, 8, v114 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v100, 8, v112 -; ALIGNED-NEXT: v_lshl_or_b32 v91, v4, 16, v3 -; ALIGNED-NEXT: s_waitcnt vmcnt(60) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v97, 8, v98 -; ALIGNED-NEXT: s_waitcnt vmcnt(58) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v87, 8, v96 -; ALIGNED-NEXT: s_waitcnt vmcnt(14) -; ALIGNED-NEXT: v_lshl_or_b32 v73, v13, 8, v16 -; ALIGNED-NEXT: s_waitcnt vmcnt(10) -; ALIGNED-NEXT: v_lshl_or_b32 v77, v9, 8, v10 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v107, off, s[0:3], s32 offset:1088 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v89, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v84, 8, v86 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v85, 8, v83 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v109, off, s[0:3], s32 offset:1096 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v120, off, s[0:3], s32 offset:1132 ; 4-byte Folded Spill -; 
ALIGNED-NEXT: v_lshl_or_b32 v74, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v68, 8, v81 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v65, 8, v71 -; ALIGNED-NEXT: v_lshl_or_b32 v46, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v67, 8, v80 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v54, 8, v53 +; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:6 +; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:7 +; ALIGNED-NEXT: v_lshl_or_b32 v45, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v66, 8, v71 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v52, 8, v53 ; ALIGNED-NEXT: v_lshl_or_b32 v117, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v55, 8, v70 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v52, 8, v64 -; ALIGNED-NEXT: v_lshl_or_b32 v115, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v39, 8, v50 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v69, 8, v48 -; ALIGNED-NEXT: v_lshl_or_b32 v99, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v36, 8, v38 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v32, 8, v33 -; ALIGNED-NEXT: v_lshl_or_b32 v82, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v55, 8, v69 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v51, 8, v54 +; ALIGNED-NEXT: v_lshl_or_b32 v114, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v38, 8, v49 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v65, 8, v39 +; ALIGNED-NEXT: v_lshl_or_b32 v98, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v35, 8, v37 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v31, 8, v34 -; ALIGNED-NEXT: v_lshl_or_b32 v66, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v29, 8, v30 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v32, 8, v31 +; ALIGNED-NEXT: v_lshl_or_b32 v81, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v33, 8, v36 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v30, 8, v34 +; ALIGNED-NEXT: v_lshl_or_b32 v68, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v27, 8, v29 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v26, 8, v28 -; ALIGNED-NEXT: v_lshl_or_b32 v51, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v50, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v23, 8, v24 -; ALIGNED-NEXT: v_lshl_or_b32 
v4, v25, 8, v21 -; ALIGNED-NEXT: v_lshl_or_b32 v49, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v18, 8, v20 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v14, 8, v15 -; ALIGNED-NEXT: v_lshl_or_b32 v27, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v17, 8, v19 -; ALIGNED-NEXT: v_lshl_or_b32 v22, v73, 16, v4 -; ALIGNED-NEXT: v_lshl_or_b32 v73, v11, 8, v12 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v77, 16, v73 -; ALIGNED-NEXT: v_lshl_or_b32 v73, v6, 8, v8 -; ALIGNED-NEXT: v_lshl_or_b32 v77, v7, 8, v5 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v77, 16, v73 -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:1 -; ALIGNED-NEXT: buffer_load_ubyte v77, v2, s[0:3], 0 offen offset:3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v25, 8, v22 +; ALIGNED-NEXT: v_lshl_or_b32 v48, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v19, 8, v21 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v16, 8, v15 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v17, 8, v20 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v5, 16, v4 +; ALIGNED-NEXT: v_lshl_or_b32 v5, v11, 8, v13 +; ALIGNED-NEXT: v_lshl_or_b32 v5, v6, 16, v5 +; ALIGNED-NEXT: s_waitcnt vmcnt(8) +; ALIGNED-NEXT: v_lshl_or_b32 v6, v7, 8, v8 +; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: v_lshl_or_b32 v89, v9, 8, v1 +; ALIGNED-NEXT: v_lshl_or_b32 v6, v89, 16, v6 +; ALIGNED-NEXT: buffer_load_ubyte v89, v2, s[0:3], 0 offen offset:1 +; ALIGNED-NEXT: s_waitcnt vmcnt(7) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v121, off, s[0:3], s32 offset:1092 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(5) +; ALIGNED-NEXT: buffer_store_dword v90, off, s[0:3], s32 offset:1084 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v90, v90, 8, v121 +; ALIGNED-NEXT: buffer_store_dword v121, off, s[0:3], s32 offset:1096 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: buffer_store_dword v106, off, s[0:3], s32 offset:1092 ; 
4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v109, off, s[0:3], s32 offset:1108 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v111, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v73, off, s[0:3], s32 offset:1076 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v122, off, s[0:3], s32 offset:1124 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v77, off, s[0:3], s32 offset:1080 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v73, v73, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v77, v77, 8, v121 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 16, v73 -; ALIGNED-NEXT: v_lshl_or_b32 v73, v109, 8, v107 -; ALIGNED-NEXT: v_lshl_or_b32 v77, v1, 8, v120 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:12 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 16, v73 -; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:13 +; ALIGNED-NEXT: buffer_store_dword v89, off, s[0:3], s32 offset:1076 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v89, v89, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v90, 16, v89 +; ALIGNED-NEXT: v_lshl_or_b32 v89, v109, 8, v106 +; ALIGNED-NEXT: v_lshl_or_b32 v90, v122, 8, v111 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1132 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v90, 16, v89 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1140 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:14 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1156 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v73, off, s[0:3], s32 offset:1168 ; 4-byte Folded Spill -; 
ALIGNED-NEXT: v_lshl_or_b32 v73, v73, 8, v1 +; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:12 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1164 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:15 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1160 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:13 +; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:14 +; ALIGNED-NEXT: buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:15 ; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:11 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v77, v107, 8, v0 -; ALIGNED-NEXT: v_mov_b32_e32 v1, v107 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 16, v73 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: v_lshl_or_b32 v89, v127, 8, v0 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v90, off, s[0:3], s32 offset:1164 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v90, v106, 8, v90 +; ALIGNED-NEXT: buffer_store_dword v106, off, s[0:3], s32 offset:1168 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v90, 16, v89 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1176 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 ; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:8 -; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:9 +; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:9 ; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:10 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v73, v120, 8, v122 +; ALIGNED-NEXT: v_lshl_or_b32 v89, v111, 8, v122 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v77, v121, 8, v109 -; ALIGNED-NEXT: 
v_lshl_or_b32 v0, v77, 16, v73 +; ALIGNED-NEXT: v_lshl_or_b32 v90, v121, 8, v109 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v90, 16, v89 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1184 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v77, v2, s[0:3], 0 offen offset:18 -; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:16 -; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:17 -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 -; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:228 -; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:224 +; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:18 +; ALIGNED-NEXT: buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:16 +; ALIGNED-NEXT: buffer_load_ubyte v89, v2, s[0:3], 0 offen offset:17 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:232 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:236 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:228 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; ALIGNED-NEXT: v_add_nc_u32_e32 v2, 0x100, v2 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v126, 8, v77 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v126, 8, v90 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v126, v73, 8, v107 +; ALIGNED-NEXT: v_lshl_or_b32 v126, v89, 8, v106 ; ALIGNED-NEXT: v_lshl_or_b32 v126, v0, 16, v126 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1228 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) @@ -4565,84 +4632,86 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: v_add_co_ci_u32_e64 v4, null, s5, v0, vcc_lo -; ALIGNED-NEXT: flat_store_byte v[3:4], v5 offset:250 
-; ALIGNED-NEXT: flat_store_byte v[3:4], v7 offset:251 -; ALIGNED-NEXT: flat_store_byte v[3:4], v6 offset:249 -; ALIGNED-NEXT: flat_store_byte v[3:4], v9 offset:255 -; ALIGNED-NEXT: flat_store_byte v[3:4], v11 offset:253 -; ALIGNED-NEXT: flat_store_byte v[3:4], v10 offset:254 -; ALIGNED-NEXT: flat_store_byte v[3:4], v12 offset:252 -; ALIGNED-NEXT: flat_store_byte v[3:4], v8 offset:248 -; ALIGNED-NEXT: flat_store_byte v[3:4], v15 offset:242 -; ALIGNED-NEXT: flat_store_byte v[3:4], v14 offset:243 -; ALIGNED-NEXT: flat_store_byte v[3:4], v18 offset:241 -; ALIGNED-NEXT: flat_store_byte v[3:4], v13 offset:247 -; ALIGNED-NEXT: flat_store_byte v[3:4], v17 offset:245 -; ALIGNED-NEXT: flat_store_byte v[3:4], v16 offset:246 -; ALIGNED-NEXT: flat_store_byte v[3:4], v19 offset:244 -; ALIGNED-NEXT: flat_store_byte v[3:4], v20 offset:240 -; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:248 -; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:252 -; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:244 -; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:240 -; ALIGNED-NEXT: flat_store_byte v[3:4], v21 offset:234 -; ALIGNED-NEXT: flat_store_byte v[3:4], v25 offset:235 -; ALIGNED-NEXT: flat_store_byte v[3:4], v23 offset:233 -; ALIGNED-NEXT: flat_store_byte v[3:4], v26 offset:239 -; ALIGNED-NEXT: flat_store_byte v[3:4], v29 offset:237 -; ALIGNED-NEXT: flat_store_byte v[3:4], v28 offset:238 -; ALIGNED-NEXT: flat_store_byte v[3:4], v30 offset:236 -; ALIGNED-NEXT: flat_store_byte v[3:4], v24 offset:232 -; ALIGNED-NEXT: flat_store_byte v[3:4], v33 offset:226 -; ALIGNED-NEXT: flat_store_byte v[3:4], v32 offset:227 -; ALIGNED-NEXT: flat_store_byte v[3:4], v36 offset:225 -; ALIGNED-NEXT: flat_store_byte v[3:4], v31 offset:231 -; ALIGNED-NEXT: flat_store_byte v[3:4], v35 offset:229 -; ALIGNED-NEXT: flat_store_byte v[3:4], v34 offset:230 -; ALIGNED-NEXT: flat_store_byte v[3:4], v37 offset:228 -; ALIGNED-NEXT: flat_store_byte v[3:4], v38 
offset:224 -; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:192 -; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: v_add_co_u32 v5, vcc_lo, v3, 3 +; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v4, vcc_lo +; ALIGNED-NEXT: flat_store_byte v[5:6], v1 offset:247 +; ALIGNED-NEXT: flat_store_byte v[5:6], v9 offset:248 +; ALIGNED-NEXT: flat_store_byte v[5:6], v7 offset:246 +; ALIGNED-NEXT: flat_store_byte v[5:6], v10 offset:252 +; ALIGNED-NEXT: flat_store_byte v[5:6], v11 offset:250 +; ALIGNED-NEXT: flat_store_byte v[5:6], v12 offset:251 +; ALIGNED-NEXT: flat_store_byte v[5:6], v13 offset:249 +; ALIGNED-NEXT: flat_store_byte v[5:6], v8 offset:245 +; ALIGNED-NEXT: flat_store_byte v[5:6], v15 offset:239 +; ALIGNED-NEXT: flat_store_byte v[5:6], v16 offset:240 +; ALIGNED-NEXT: flat_store_byte v[5:6], v19 offset:238 +; ALIGNED-NEXT: flat_store_byte v[5:6], v14 offset:244 +; ALIGNED-NEXT: flat_store_byte v[5:6], v17 offset:242 +; ALIGNED-NEXT: flat_store_byte v[5:6], v18 offset:243 +; ALIGNED-NEXT: flat_store_byte v[5:6], v20 offset:241 +; ALIGNED-NEXT: flat_store_byte v[5:6], v21 offset:237 +; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:248 +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:252 +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:244 +; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:240 +; ALIGNED-NEXT: flat_store_byte v[5:6], v22 offset:231 +; ALIGNED-NEXT: flat_store_byte v[5:6], v25 offset:232 +; ALIGNED-NEXT: flat_store_byte v[5:6], v23 offset:230 +; ALIGNED-NEXT: flat_store_byte v[5:6], v26 offset:236 +; ALIGNED-NEXT: flat_store_byte v[5:6], v27 offset:234 +; ALIGNED-NEXT: flat_store_byte v[5:6], v28 offset:235 +; ALIGNED-NEXT: flat_store_byte v[5:6], v29 offset:233 +; ALIGNED-NEXT: flat_store_byte v[5:6], v24 offset:229 +; ALIGNED-NEXT: flat_store_byte v[5:6], v31 offset:223 +; ALIGNED-NEXT: 
flat_store_byte v[5:6], v32 offset:224 +; ALIGNED-NEXT: flat_store_byte v[5:6], v35 offset:222 +; ALIGNED-NEXT: flat_store_byte v[5:6], v30 offset:228 +; ALIGNED-NEXT: flat_store_byte v[5:6], v33 offset:226 +; ALIGNED-NEXT: flat_store_byte v[5:6], v34 offset:227 +; ALIGNED-NEXT: flat_store_byte v[5:6], v36 offset:225 +; ALIGNED-NEXT: flat_store_byte v[5:6], v37 offset:221 +; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:204 ; ALIGNED-NEXT: buffer_store_dword v117, off, s[0:3], s32 offset:200 -; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:196 -; ALIGNED-NEXT: flat_store_byte v[3:4], v68 offset:213 -; ALIGNED-NEXT: flat_store_byte v[3:4], v65 offset:215 -; ALIGNED-NEXT: flat_store_byte v[3:4], v39 offset:209 -; ALIGNED-NEXT: flat_store_byte v[3:4], v69 offset:211 -; ALIGNED-NEXT: flat_store_byte v[3:4], v48 offset:210 -; ALIGNED-NEXT: flat_store_byte v[3:4], v71 offset:214 -; ALIGNED-NEXT: flat_store_byte v[3:4], v81 offset:212 -; ALIGNED-NEXT: flat_store_byte v[3:4], v53 offset:218 -; ALIGNED-NEXT: flat_store_byte v[3:4], v54 offset:219 -; ALIGNED-NEXT: flat_store_byte v[3:4], v67 offset:217 -; ALIGNED-NEXT: flat_store_byte v[3:4], v52 offset:223 -; ALIGNED-NEXT: flat_store_byte v[3:4], v55 offset:221 -; ALIGNED-NEXT: flat_store_byte v[3:4], v64 offset:222 -; ALIGNED-NEXT: flat_store_byte v[3:4], v70 offset:220 -; ALIGNED-NEXT: flat_store_byte v[3:4], v80 offset:216 -; ALIGNED-NEXT: flat_store_byte v[3:4], v50 offset:208 -; ALIGNED-NEXT: buffer_store_dword v74, off, s[0:3], s32 offset:216 -; ALIGNED-NEXT: buffer_store_dword v89, off, s[0:3], s32 offset:220 -; ALIGNED-NEXT: buffer_store_dword v91, off, s[0:3], s32 offset:212 -; ALIGNED-NEXT: buffer_store_dword v106, off, s[0:3], s32 offset:208 -; ALIGNED-NEXT: flat_store_byte v[3:4], v83 offset:202 -; ALIGNED-NEXT: flat_store_byte v[3:4], v85 offset:203 -; ALIGNED-NEXT: flat_store_byte v[3:4], v84 offset:201 -; 
ALIGNED-NEXT: flat_store_byte v[3:4], v87 offset:207 -; ALIGNED-NEXT: flat_store_byte v[3:4], v97 offset:205 -; ALIGNED-NEXT: flat_store_byte v[3:4], v96 offset:206 -; ALIGNED-NEXT: flat_store_byte v[3:4], v98 offset:204 -; ALIGNED-NEXT: flat_store_byte v[3:4], v86 offset:200 -; ALIGNED-NEXT: flat_store_byte v[3:4], v101 offset:194 -; ALIGNED-NEXT: flat_store_byte v[3:4], v102 offset:195 -; ALIGNED-NEXT: flat_store_byte v[3:4], v113 offset:193 -; ALIGNED-NEXT: flat_store_byte v[3:4], v100 offset:199 -; ALIGNED-NEXT: flat_store_byte v[3:4], v103 offset:197 -; ALIGNED-NEXT: flat_store_byte v[3:4], v112 offset:198 -; ALIGNED-NEXT: flat_store_byte v[3:4], v114 offset:196 -; ALIGNED-NEXT: flat_store_byte v[3:4], v116 offset:192 +; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:196 +; ALIGNED-NEXT: flat_store_byte v[5:6], v67 offset:210 +; ALIGNED-NEXT: flat_store_byte v[5:6], v64 offset:212 +; ALIGNED-NEXT: flat_store_byte v[5:6], v38 offset:206 +; ALIGNED-NEXT: flat_store_byte v[5:6], v65 offset:208 +; ALIGNED-NEXT: flat_store_byte v[5:6], v39 offset:207 +; ALIGNED-NEXT: flat_store_byte v[5:6], v70 offset:211 +; ALIGNED-NEXT: flat_store_byte v[5:6], v80 offset:209 +; ALIGNED-NEXT: flat_store_byte v[5:6], v53 offset:215 +; ALIGNED-NEXT: flat_store_byte v[5:6], v52 offset:216 +; ALIGNED-NEXT: flat_store_byte v[5:6], v66 offset:214 +; ALIGNED-NEXT: flat_store_byte v[5:6], v51 offset:220 +; ALIGNED-NEXT: flat_store_byte v[5:6], v55 offset:218 +; ALIGNED-NEXT: flat_store_byte v[5:6], v54 offset:219 +; ALIGNED-NEXT: flat_store_byte v[5:6], v69 offset:217 +; ALIGNED-NEXT: flat_store_byte v[5:6], v71 offset:213 +; ALIGNED-NEXT: flat_store_byte v[5:6], v49 offset:205 +; ALIGNED-NEXT: buffer_store_dword v72, off, s[0:3], s32 offset:216 +; ALIGNED-NEXT: buffer_store_dword v79, off, s[0:3], s32 offset:220 +; ALIGNED-NEXT: buffer_store_dword v88, off, s[0:3], s32 offset:212 +; ALIGNED-NEXT: buffer_store_dword v107, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: 
flat_store_byte v[5:6], v82 offset:199 +; ALIGNED-NEXT: flat_store_byte v[5:6], v85 offset:200 +; ALIGNED-NEXT: flat_store_byte v[5:6], v83 offset:198 +; ALIGNED-NEXT: flat_store_byte v[5:6], v86 offset:204 +; ALIGNED-NEXT: flat_store_byte v[5:6], v87 offset:202 +; ALIGNED-NEXT: flat_store_byte v[5:6], v96 offset:203 +; ALIGNED-NEXT: flat_store_byte v[5:6], v97 offset:201 +; ALIGNED-NEXT: flat_store_byte v[5:6], v84 offset:197 +; ALIGNED-NEXT: flat_store_byte v[5:6], v101 offset:191 +; ALIGNED-NEXT: flat_store_byte v[5:6], v100 offset:192 +; ALIGNED-NEXT: flat_store_byte v[5:6], v112 offset:190 +; ALIGNED-NEXT: flat_store_byte v[5:6], v99 offset:196 +; ALIGNED-NEXT: flat_store_byte v[5:6], v103 offset:194 +; ALIGNED-NEXT: flat_store_byte v[5:6], v102 offset:195 +; ALIGNED-NEXT: flat_store_byte v[5:6], v113 offset:193 +; ALIGNED-NEXT: flat_store_byte v[5:6], v115 offset:189 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 ; ALIGNED-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] ; ALIGNED-NEXT: s_and_b32 vcc_lo, exec_lo, s6 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) @@ -4656,22 +4725,22 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 -; ALIGNED-NEXT: flat_store_byte v[3:4], v118 offset:186 -; ALIGNED-NEXT: flat_store_byte v[3:4], v40 offset:187 -; ALIGNED-NEXT: flat_store_byte v[3:4], v119 offset:185 -; ALIGNED-NEXT: flat_store_byte v[3:4], v42 offset:191 -; ALIGNED-NEXT: flat_store_byte v[3:4], v43 offset:189 -; ALIGNED-NEXT: flat_store_byte v[3:4], v44 offset:190 -; ALIGNED-NEXT: flat_store_byte v[3:4], v45 offset:188 -; ALIGNED-NEXT: flat_store_byte v[3:4], v41 offset:184 -; ALIGNED-NEXT: flat_store_byte v[3:4], v57 offset:178 -; ALIGNED-NEXT: flat_store_byte v[3:4], v56 
offset:179 -; ALIGNED-NEXT: flat_store_byte v[3:4], v60 offset:177 -; ALIGNED-NEXT: flat_store_byte v[3:4], v47 offset:183 -; ALIGNED-NEXT: flat_store_byte v[3:4], v59 offset:181 -; ALIGNED-NEXT: flat_store_byte v[3:4], v58 offset:182 -; ALIGNED-NEXT: flat_store_byte v[3:4], v61 offset:180 -; ALIGNED-NEXT: flat_store_byte v[3:4], v62 offset:176 +; ALIGNED-NEXT: flat_store_byte v[5:6], v116 offset:183 +; ALIGNED-NEXT: flat_store_byte v[5:6], v40 offset:184 +; ALIGNED-NEXT: flat_store_byte v[5:6], v118 offset:182 +; ALIGNED-NEXT: flat_store_byte v[5:6], v41 offset:188 +; ALIGNED-NEXT: flat_store_byte v[5:6], v43 offset:186 +; ALIGNED-NEXT: flat_store_byte v[5:6], v42 offset:187 +; ALIGNED-NEXT: flat_store_byte v[5:6], v44 offset:185 +; ALIGNED-NEXT: flat_store_byte v[5:6], v119 offset:181 +; ALIGNED-NEXT: flat_store_byte v[5:6], v47 offset:175 +; ALIGNED-NEXT: flat_store_byte v[5:6], v56 offset:176 +; ALIGNED-NEXT: flat_store_byte v[5:6], v59 offset:174 +; ALIGNED-NEXT: flat_store_byte v[5:6], v46 offset:180 +; ALIGNED-NEXT: flat_store_byte v[5:6], v57 offset:178 +; ALIGNED-NEXT: flat_store_byte v[5:6], v58 offset:179 +; ALIGNED-NEXT: flat_store_byte v[5:6], v60 offset:177 +; ALIGNED-NEXT: flat_store_byte v[5:6], v61 offset:173 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 @@ -4684,22 +4753,22 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 -; ALIGNED-NEXT: flat_store_byte v[3:4], v63 offset:170 -; ALIGNED-NEXT: flat_store_byte v[3:4], v75 offset:171 -; ALIGNED-NEXT: flat_store_byte v[3:4], v72 offset:169 -; ALIGNED-NEXT: flat_store_byte v[3:4], v78 offset:175 -; ALIGNED-NEXT: flat_store_byte v[3:4], 
v79 offset:173 -; ALIGNED-NEXT: flat_store_byte v[3:4], v88 offset:174 -; ALIGNED-NEXT: flat_store_byte v[3:4], v90 offset:172 -; ALIGNED-NEXT: flat_store_byte v[3:4], v76 offset:168 -; ALIGNED-NEXT: flat_store_byte v[3:4], v94 offset:162 -; ALIGNED-NEXT: flat_store_byte v[3:4], v93 offset:163 -; ALIGNED-NEXT: flat_store_byte v[3:4], v105 offset:161 -; ALIGNED-NEXT: flat_store_byte v[3:4], v92 offset:167 -; ALIGNED-NEXT: flat_store_byte v[3:4], v95 offset:165 -; ALIGNED-NEXT: flat_store_byte v[3:4], v104 offset:166 -; ALIGNED-NEXT: flat_store_byte v[3:4], v108 offset:164 -; ALIGNED-NEXT: flat_store_byte v[3:4], v110 offset:160 +; ALIGNED-NEXT: flat_store_byte v[5:6], v62 offset:167 +; ALIGNED-NEXT: flat_store_byte v[5:6], v74 offset:168 +; ALIGNED-NEXT: flat_store_byte v[5:6], v63 offset:166 +; ALIGNED-NEXT: flat_store_byte v[5:6], v75 offset:172 +; ALIGNED-NEXT: flat_store_byte v[5:6], v77 offset:170 +; ALIGNED-NEXT: flat_store_byte v[5:6], v76 offset:171 +; ALIGNED-NEXT: flat_store_byte v[5:6], v78 offset:169 +; ALIGNED-NEXT: flat_store_byte v[5:6], v73 offset:165 +; ALIGNED-NEXT: flat_store_byte v[5:6], v93 offset:159 +; ALIGNED-NEXT: flat_store_byte v[5:6], v92 offset:160 +; ALIGNED-NEXT: flat_store_byte v[5:6], v104 offset:158 +; ALIGNED-NEXT: flat_store_byte v[5:6], v91 offset:164 +; ALIGNED-NEXT: flat_store_byte v[5:6], v95 offset:162 +; ALIGNED-NEXT: flat_store_byte v[5:6], v94 offset:163 +; ALIGNED-NEXT: flat_store_byte v[5:6], v105 offset:161 +; ALIGNED-NEXT: flat_store_byte v[5:6], v108 offset:157 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 @@ -4712,44 +4781,44 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1136 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, 
s[0:3], s32 offset:256 -; ALIGNED-NEXT: flat_store_byte v[3:4], v111 offset:154 -; ALIGNED-NEXT: flat_store_byte v[3:4], v124 offset:155 -; ALIGNED-NEXT: flat_store_byte v[3:4], v123 offset:153 -; ALIGNED-NEXT: flat_store_byte v[3:4], v127 offset:159 +; ALIGNED-NEXT: flat_store_byte v[5:6], v110 offset:151 +; ALIGNED-NEXT: flat_store_byte v[5:6], v124 offset:152 +; ALIGNED-NEXT: flat_store_byte v[5:6], v120 offset:150 +; ALIGNED-NEXT: flat_store_byte v[5:6], v125 offset:156 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1152 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:157 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1160 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:154 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1156 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:158 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:155 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1144 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:156 -; ALIGNED-NEXT: flat_store_byte v[3:4], v125 offset:152 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1108 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:153 +; ALIGNED-NEXT: flat_store_byte v[5:6], v123 offset:149 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1104 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:146 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1100 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:143 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1088 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:147 -; 
ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1084 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:144 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1080 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:145 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1124 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:142 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:151 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:148 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1116 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:149 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:146 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1120 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:150 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1104 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:147 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1100 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:148 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:145 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1072 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:144 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:141 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1064 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 @@ -4764,49 +4833,49 @@ define void @memcpy_p0_p5_sz2048(ptr 
addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1060 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:138 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:135 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1056 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:139 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:136 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1052 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:137 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:134 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1040 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:143 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:140 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1032 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:141 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:138 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1036 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:142 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:139 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1024 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:140 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:137 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1044 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:136 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:133 ; 
ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1004 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:130 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:127 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:996 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:131 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:128 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:992 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:129 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:126 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1016 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:135 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:132 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1008 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:133 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:130 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1012 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:134 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:131 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1000 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:132 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:129 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:988 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:128 @@ -4824,52 +4893,52 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; 
ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:980 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:122 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:119 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:123 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:120 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:121 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:118 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:127 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:124 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:125 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:122 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:126 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:123 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:124 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:121 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:120 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:117 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; 
ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:114 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:111 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:115 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:112 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:113 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:110 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:119 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:116 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:117 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:114 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:118 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:115 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:116 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:113 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:112 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:109 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:376 @@ -4884,52 +4953,52 @@ define void @memcpy_p0_p5_sz2048(ptr 
addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:368 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:106 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:103 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:107 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:104 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:105 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:102 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:111 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:108 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:109 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:106 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:110 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:107 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:108 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:105 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:104 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:101 ; ALIGNED-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:98 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:95 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:99 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:96 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:97 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:94 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:103 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:100 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:101 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:98 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:102 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:99 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:100 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:97 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:96 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:93 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:328 @@ -4944,52 +5013,52 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:90 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:87 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:91 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:88 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:89 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:86 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:95 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:92 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:93 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:90 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:94 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:91 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:92 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:89 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: 
flat_store_byte v[3:4], v0 offset:88 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:85 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:82 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:79 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:83 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:80 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:81 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:78 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:87 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:84 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:85 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:82 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:86 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:83 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:84 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:81 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:80 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:77 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], 
s32 offset:744 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:344 @@ -5004,49 +5073,49 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:74 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:71 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:75 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:72 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:73 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:70 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:79 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:76 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:77 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:74 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:78 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:75 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:76 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:73 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 
offset:720 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:72 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:69 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:66 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:63 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:67 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:64 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:65 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:62 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:71 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:68 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:69 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:66 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:70 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:67 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:68 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:65 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:64 @@ -5064,52 +5133,52 
@@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:61 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:58 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:58 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:55 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:59 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:56 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:57 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:54 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:63 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:60 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:59 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:62 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:57 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:60 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; 
ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:56 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:53 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:53 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:50 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:50 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:47 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:51 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:48 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:49 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:46 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:55 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:52 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:54 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:51 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:52 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:49 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:48 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:45 ; 
ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 @@ -5124,56 +5193,56 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:432 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:43 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:40 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:42 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:39 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:41 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:38 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:40 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:47 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:37 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:46 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:44 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:43 ; ALIGNED-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:45 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:42 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:44 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:41 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:35 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:32 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:34 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:31 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:33 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:30 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:32 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:39 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:36 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:38 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:35 ; ALIGNED-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:37 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:34 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:36 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:33 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:392 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:396 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload @@ -5182,46 +5251,46 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v126, off, s[0:3], s32 offset:384 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:26 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:23 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:27 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:24 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:25 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:22 ; ALIGNED-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:31 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:28 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:29 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:26 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:30 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:27 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:28 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:25 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:24 -; ALIGNED-NEXT: flat_store_byte v[3:4], v77 offset:18 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:21 +; ALIGNED-NEXT: flat_store_byte v[5:6], v90 offset:15 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1188 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:19 -; ALIGNED-NEXT: flat_store_byte v[3:4], v73 offset:17 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:16 +; ALIGNED-NEXT: flat_store_byte v[5:6], v89 offset:14 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:23 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:20 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 
offset:21 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:18 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:22 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:19 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:20 -; ALIGNED-NEXT: flat_store_byte v[3:4], v107 offset:16 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:17 +; ALIGNED-NEXT: flat_store_byte v[3:4], v106 offset:16 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1184 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 @@ -5231,42 +5300,42 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1140 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1132 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 -; ALIGNED-NEXT: flat_store_byte v[3:4], v109 offset:10 -; ALIGNED-NEXT: flat_store_byte v[3:4], v121 offset:11 +; ALIGNED-NEXT: flat_store_byte v[5:6], v109 offset:7 +; ALIGNED-NEXT: flat_store_byte v[5:6], v121 offset:8 +; ALIGNED-NEXT: flat_store_byte v[5:6], v127 offset:10 +; ALIGNED-NEXT: flat_store_byte v[5:6], v111 offset:6 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:13 -; ALIGNED-NEXT: flat_store_byte v[3:4], v120 offset:9 -; ALIGNED-NEXT: 
flat_store_byte v[3:4], v1 offset:15 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:12 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1164 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:14 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1156 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:11 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1160 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:12 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:9 ; ALIGNED-NEXT: flat_store_byte v[3:4], v122 offset:8 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1092 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1096 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:2 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1080 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1084 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:3 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1076 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:1 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1112 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1124 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:7 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1096 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:4 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1108 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt 
vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:5 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1132 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:2 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1112 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:6 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1088 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:3 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1092 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:4 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Reload @@ -5399,62 +5468,68 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: .LBB5_2: ; %memmove_fwd_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v2, s4 -; CHECK-NEXT: v_add_co_ci_u32_e64 v97, null, s5, v3, vcc_lo +; CHECK-NEXT: v_add_co_u32 v52, vcc_lo, v2, s4 +; CHECK-NEXT: v_add_co_ci_u32_e64 v53, null, s5, v3, vcc_lo +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[52:53] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[52:53] +; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v52, 48 +; CHECK-NEXT: v_add_co_ci_u32_e64 v97, null, 0, v53, vcc_lo +; CHECK-NEXT: v_add_co_u32 v80, vcc_lo, v52, 0x60 +; CHECK-NEXT: v_add_co_ci_u32_e64 v81, null, 0, v53, vcc_lo +; CHECK-NEXT: v_add_co_u32 v84, vcc_lo, 0x50, v52 +; CHECK-NEXT: v_add_co_ci_u32_e64 v85, null, 0, v53, vcc_lo +; CHECK-NEXT: s_clause 0xd +; CHECK-NEXT: flat_load_dwordx4 v[12:15], v[52:53] offset:32 +; CHECK-NEXT: flat_load_dwordx4 v[16:19], v[52:53] offset:64 +; CHECK-NEXT: flat_load_dwordx4 v[20:23], v[80:81] offset:128 +; CHECK-NEXT: flat_load_dwordx4 v[24:27], v[80:81] offset:144 
+; CHECK-NEXT: flat_load_dwordx4 v[28:31], v[80:81] offset:96 +; CHECK-NEXT: flat_load_dwordx4 v[32:35], v[80:81] offset:112 +; CHECK-NEXT: flat_load_dwordx4 v[36:39], v[80:81] offset:64 +; CHECK-NEXT: flat_load_dwordx4 v[48:51], v[80:81] offset:80 +; CHECK-NEXT: flat_load_dwordx4 v[52:55], v[52:53] offset:128 +; CHECK-NEXT: flat_load_dwordx4 v[64:67], v[80:81] offset:48 +; CHECK-NEXT: flat_load_dwordx4 v[68:71], v[80:81] +; CHECK-NEXT: flat_load_dwordx4 v[80:83], v[80:81] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[84:87], v[84:85] +; CHECK-NEXT: flat_load_dwordx4 v[96:99], v[96:97] ; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4 ; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[96:97] offset:224 -; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[96:97] offset:240 -; CHECK-NEXT: flat_load_dwordx4 v[12:15], v[96:97] offset:192 -; CHECK-NEXT: flat_load_dwordx4 v[16:19], v[96:97] offset:208 -; CHECK-NEXT: flat_load_dwordx4 v[20:23], v[96:97] offset:160 -; CHECK-NEXT: flat_load_dwordx4 v[24:27], v[96:97] offset:176 -; CHECK-NEXT: flat_load_dwordx4 v[28:31], v[96:97] offset:128 -; CHECK-NEXT: flat_load_dwordx4 v[32:35], v[96:97] offset:144 -; CHECK-NEXT: flat_load_dwordx4 v[36:39], v[96:97] offset:96 -; CHECK-NEXT: flat_load_dwordx4 v[48:51], v[96:97] offset:112 -; CHECK-NEXT: flat_load_dwordx4 v[52:55], v[96:97] offset:64 -; CHECK-NEXT: flat_load_dwordx4 v[64:67], v[96:97] offset:80 -; CHECK-NEXT: flat_load_dwordx4 v[68:71], v[96:97] offset:32 -; CHECK-NEXT: flat_load_dwordx4 v[80:83], v[96:97] offset:48 -; CHECK-NEXT: flat_load_dwordx4 v[84:87], v[96:97] -; CHECK-NEXT: flat_load_dwordx4 v[96:99], v[96:97] offset:16 ; CHECK-NEXT: s_add_u32 s4, s4, 0x100 ; CHECK-NEXT: s_addc_u32 s5, s5, 0 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[4:7] offset:224 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], 
v[8:11] offset:240 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[12:15] offset:192 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[16:19] offset:208 +; CHECK-NEXT: v_add_co_u32 v102, vcc_lo, v100, 48 +; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0x800 +; CHECK-NEXT: v_add_co_ci_u32_e64 v103, null, 0, v101, vcc_lo +; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[16:19] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[12:15] offset:32 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[8:11] +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[4:7] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[20:23] offset:160 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[20:23] offset:176 ; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[24:27] offset:176 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[24:27] offset:192 ; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[28:31] offset:128 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[28:31] offset:144 ; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[32:35] offset:144 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[32:35] offset:160 ; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[36:39] offset:96 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[36:39] offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:112 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[48:51] offset:128 ; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:128 ; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15) -; CHECK-NEXT: 
flat_store_dwordx4 v[100:101], v[64:67] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[64:67] offset:96 ; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:32 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[68:71] offset:48 ; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[80:83] offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[84:87] +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[84:87] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99] offset:16 -; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0x800 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[96:99] ; CHECK-NEXT: s_cbranch_scc1 .LBB5_2 ; CHECK-NEXT: .LBB5_3: ; %Flow5 ; CHECK-NEXT: s_andn2_saveexec_b32 s8, s6 @@ -5465,62 +5540,71 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; CHECK-NEXT: s_mov_b32 s7, -1 ; CHECK-NEXT: .LBB5_5: ; %memmove_bwd_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v2, s4 -; CHECK-NEXT: v_add_co_ci_u32_e64 v97, null, s5, v3, vcc_lo +; CHECK-NEXT: v_add_co_u32 v24, vcc_lo, v2, s4 +; CHECK-NEXT: v_add_co_ci_u32_e64 v25, null, s5, v3, vcc_lo +; CHECK-NEXT: s_clause 0x4 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[24:25] offset:64 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[24:25] offset:32 +; CHECK-NEXT: flat_load_dwordx4 v[12:15], v[24:25] +; CHECK-NEXT: flat_load_dwordx4 v[16:19], v[24:25] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[20:23], v[24:25] offset:128 +; CHECK-NEXT: v_add_co_u32 v80, vcc_lo, v24, 0x60 +; CHECK-NEXT: v_add_co_ci_u32_e64 v81, null, 0, v25, vcc_lo +; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v24, 48 +; CHECK-NEXT: v_add_co_ci_u32_e64 v97, null, 0, v25, vcc_lo +; CHECK-NEXT: v_add_co_u32 v84, vcc_lo, 0x50, v24 +; 
CHECK-NEXT: v_add_co_ci_u32_e64 v85, null, 0, v25, vcc_lo +; CHECK-NEXT: s_clause 0xa +; CHECK-NEXT: flat_load_dwordx4 v[24:27], v[80:81] offset:128 +; CHECK-NEXT: flat_load_dwordx4 v[28:31], v[80:81] offset:144 +; CHECK-NEXT: flat_load_dwordx4 v[32:35], v[80:81] offset:96 +; CHECK-NEXT: flat_load_dwordx4 v[36:39], v[80:81] offset:112 +; CHECK-NEXT: flat_load_dwordx4 v[48:51], v[80:81] offset:64 +; CHECK-NEXT: flat_load_dwordx4 v[52:55], v[80:81] offset:80 +; CHECK-NEXT: flat_load_dwordx4 v[64:67], v[80:81] offset:48 +; CHECK-NEXT: flat_load_dwordx4 v[68:71], v[80:81] +; CHECK-NEXT: flat_load_dwordx4 v[80:83], v[80:81] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[84:87], v[84:85] +; CHECK-NEXT: flat_load_dwordx4 v[96:99], v[96:97] ; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4 ; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[96:97] offset:224 -; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[96:97] offset:240 -; CHECK-NEXT: flat_load_dwordx4 v[12:15], v[96:97] offset:192 -; CHECK-NEXT: flat_load_dwordx4 v[16:19], v[96:97] offset:208 -; CHECK-NEXT: flat_load_dwordx4 v[20:23], v[96:97] offset:160 -; CHECK-NEXT: flat_load_dwordx4 v[24:27], v[96:97] offset:176 -; CHECK-NEXT: flat_load_dwordx4 v[28:31], v[96:97] offset:128 -; CHECK-NEXT: flat_load_dwordx4 v[32:35], v[96:97] offset:144 -; CHECK-NEXT: flat_load_dwordx4 v[36:39], v[96:97] offset:96 -; CHECK-NEXT: flat_load_dwordx4 v[48:51], v[96:97] offset:112 -; CHECK-NEXT: flat_load_dwordx4 v[52:55], v[96:97] offset:64 -; CHECK-NEXT: flat_load_dwordx4 v[64:67], v[96:97] offset:80 -; CHECK-NEXT: flat_load_dwordx4 v[68:71], v[96:97] offset:32 -; CHECK-NEXT: flat_load_dwordx4 v[80:83], v[96:97] offset:48 -; CHECK-NEXT: flat_load_dwordx4 v[84:87], v[96:97] -; CHECK-NEXT: flat_load_dwordx4 v[96:99], v[96:97] offset:16 ; CHECK-NEXT: s_add_u32 s4, s4, 0xffffff00 ; CHECK-NEXT: s_addc_u32 s5, s5, -1 +; CHECK-NEXT: v_add_co_u32 v102, vcc_lo, v100, 48 +; 
CHECK-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] +; CHECK-NEXT: v_add_co_ci_u32_e64 v103, null, 0, v101, vcc_lo ; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[4:7] offset:224 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[4:7] offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[8:11] offset:240 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[8:11] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[12:15] offset:192 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[12:15] ; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[16:19] offset:208 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[16:19] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[20:23] offset:160 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[20:23] offset:128 ; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[24:27] offset:176 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[24:27] offset:176 ; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[28:31] offset:128 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[28:31] offset:192 ; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[32:35] offset:144 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[32:35] offset:144 ; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[36:39] offset:96 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[36:39] offset:160 ; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:112 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[48:51] offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:64 +; CHECK-NEXT: 
flat_store_dwordx4 v[102:103], v[52:55] offset:128 ; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[64:67] offset:96 ; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:32 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[68:71] offset:48 ; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[80:83] offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[84:87] +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[84:87] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99] offset:16 -; CHECK-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[96:99] ; CHECK-NEXT: s_cbranch_scc0 .LBB5_5 ; CHECK-NEXT: .LBB5_6: ; %Flow6 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 @@ -5530,14 +5614,23 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; ALIGNED-LABEL: memmove_p0_p0_sz2048: ; ALIGNED: ; %bb.0: ; %entry ; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill 
+; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v72, off, s[0:3], s32 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_mov_b32 s4, exec_lo ; ALIGNED-NEXT: v_cmpx_ge_u64_e64 v[2:3], v[0:1] ; ALIGNED-NEXT: s_xor_b32 s6, exec_lo, s4 @@ -5546,609 +5639,635 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; ALIGNED-NEXT: s_mov_b64 s[4:5], 0 ; ALIGNED-NEXT: .LBB5_2: ; %memmove_fwd_loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 -; ALIGNED-NEXT: v_add_co_u32 v20, vcc_lo, v2, s4 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v21, null, s5, v3, vcc_lo 
-; ALIGNED-NEXT: s_clause 0xf -; ALIGNED-NEXT: flat_load_dwordx4 v[16:19], v[20:21] offset:240 -; ALIGNED-NEXT: flat_load_dwordx4 v[22:25], v[20:21] offset:224 -; ALIGNED-NEXT: flat_load_dwordx4 v[4:7], v[20:21] -; ALIGNED-NEXT: flat_load_dwordx4 v[8:11], v[20:21] offset:16 -; ALIGNED-NEXT: flat_load_dwordx4 v[12:15], v[20:21] offset:32 -; ALIGNED-NEXT: flat_load_dwordx4 v[98:101], v[20:21] offset:48 -; ALIGNED-NEXT: flat_load_dwordx4 v[112:115], v[20:21] offset:64 -; ALIGNED-NEXT: flat_load_dwordx4 v[82:85], v[20:21] offset:80 -; ALIGNED-NEXT: flat_load_dwordx4 v[116:119], v[20:21] offset:96 -; ALIGNED-NEXT: flat_load_dwordx4 v[66:69], v[20:21] offset:112 -; ALIGNED-NEXT: flat_load_dwordx4 v[40:43], v[20:21] offset:128 -; ALIGNED-NEXT: flat_load_dwordx4 v[50:53], v[20:21] offset:144 -; ALIGNED-NEXT: flat_load_dwordx4 v[44:47], v[20:21] offset:160 -; ALIGNED-NEXT: flat_load_dwordx4 v[34:37], v[20:21] offset:176 -; ALIGNED-NEXT: flat_load_dwordx4 v[30:33], v[20:21] offset:192 -; ALIGNED-NEXT: flat_load_dwordx4 v[26:29], v[20:21] offset:208 -; ALIGNED-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 -; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 -; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 -; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 -; ALIGNED-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 -; ALIGNED-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 -; ALIGNED-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 -; ALIGNED-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v17, null, s5, v1, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v4, vcc_lo, v2, s4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v5, null, s5, v3, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v6, vcc_lo, v4, 48 +; ALIGNED-NEXT: flat_load_dwordx4 v[20:23], v[4:5] 
offset:128 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v8, vcc_lo, 0x50, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v12, vcc_lo, 0x60, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v16, vcc_lo, 0x70, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v24, vcc_lo, 0x90, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v38, vcc_lo, 0xa0, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v39, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v52, vcc_lo, 0xb0, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v53, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v54, vcc_lo, 0xc0, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v55, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v64, vcc_lo, 0xd0, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v65, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v70, vcc_lo, 0xe0, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v71, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v80, vcc_lo, 0xf0, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v81, null, 0, v5, vcc_lo +; ALIGNED-NEXT: s_clause 0xe +; ALIGNED-NEXT: flat_load_dwordx4 v[34:37], v[4:5] +; ALIGNED-NEXT: flat_load_dwordx4 v[30:33], v[4:5] offset:16 +; ALIGNED-NEXT: flat_load_dwordx4 v[26:29], v[4:5] offset:32 +; ALIGNED-NEXT: flat_load_dwordx4 v[48:51], v[4:5] offset:64 +; ALIGNED-NEXT: flat_load_dwordx4 v[4:7], v[6:7] +; ALIGNED-NEXT: flat_load_dwordx4 v[8:11], v[8:9] +; ALIGNED-NEXT: flat_load_dwordx4 v[12:15], v[12:13] +; ALIGNED-NEXT: flat_load_dwordx4 v[16:19], v[16:17] +; ALIGNED-NEXT: flat_load_dwordx4 v[112:115], v[24:25] +; ALIGNED-NEXT: flat_load_dwordx4 v[82:85], v[38:39] +; ALIGNED-NEXT: flat_load_dwordx4 v[116:119], v[52:53] +; ALIGNED-NEXT: flat_load_dwordx4 v[66:69], v[54:55] +; ALIGNED-NEXT: flat_load_dwordx4 v[40:43], v[64:65] +; ALIGNED-NEXT: flat_load_dwordx4 
v[52:55], v[70:71] +; ALIGNED-NEXT: flat_load_dwordx4 v[44:47], v[80:81] +; ALIGNED-NEXT: v_add_co_u32 v24, vcc_lo, v0, s4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v25, null, s5, v1, vcc_lo ; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 ; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v21 offset:254 -; ALIGNED-NEXT: flat_store_byte v[16:17], v21 offset:252 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v20 offset:250 -; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:248 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v19 offset:246 -; ALIGNED-NEXT: flat_store_byte v[16:17], v19 offset:244 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v18 offset:242 -; ALIGNED-NEXT: flat_store_byte v[16:17], v18 offset:240 -; ALIGNED-NEXT: s_waitcnt lgkmcnt(22) -; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:80 -; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:84 -; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:88 -; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:92 +; ALIGNED-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:160 +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:164 +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:168 +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:172 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:92 -; ALIGNED-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:88 -; ALIGNED-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 -; ALIGNED-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:80 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 8, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v20 -; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v20, 8, v20 +; ALIGNED-NEXT: buffer_load_dword v102, off, s[0:3], s32 offset:172 +; ALIGNED-NEXT: buffer_load_dword v103, off, s[0:3], s32 offset:168 +; ALIGNED-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 +; ALIGNED-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 +; ALIGNED-NEXT: v_add_co_u32 v22, vcc_lo, v24, 6 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v25, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v20, vcc_lo, v24, 3 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v25, vcc_lo ; ALIGNED-NEXT: s_cmp_lg_u64 s[4:5], 0x800 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v25 offset:238 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:236 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v102 offset:136 +; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:137 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v24 offset:234 -; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:232 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v103 offset:132 +; ALIGNED-NEXT: flat_store_byte v[20:21], v103 offset:133 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v23 offset:230 -; ALIGNED-NEXT: flat_store_byte v[16:17], v23 offset:228 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v56 offset:128 +; ALIGNED-NEXT: flat_store_byte v[20:21], v56 offset:129 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v22 offset:226 -; ALIGNED-NEXT: flat_store_byte v[16:17], v22 offset:224 -; ALIGNED-NEXT: s_waitcnt lgkmcnt(16) -; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:32 -; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:36 -; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:40 -; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:44 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v57 offset:124 +; ALIGNED-NEXT: flat_store_byte v[24:25], 
v57 offset:128 +; ALIGNED-NEXT: s_waitcnt lgkmcnt(19) +; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:224 +; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:228 +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:232 +; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:236 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:44 -; ALIGNED-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:40 -; ALIGNED-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:36 -; ALIGNED-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:32 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v29 offset:222 -; ALIGNED-NEXT: flat_store_byte v[16:17], v29 offset:220 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v28 offset:218 -; ALIGNED-NEXT: flat_store_byte v[16:17], v28 offset:216 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v27 offset:214 -; ALIGNED-NEXT: flat_store_byte v[16:17], v27 offset:212 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v26 offset:210 -; ALIGNED-NEXT: flat_store_byte v[16:17], v26 offset:208 -; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:48 -; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:52 -; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:56 -; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:60 +; ALIGNED-NEXT: buffer_load_dword v101, off, s[0:3], s32 offset:236 +; ALIGNED-NEXT: buffer_load_dword v100, off, s[0:3], s32 offset:232 +; ALIGNED-NEXT: buffer_load_dword v99, off, s[0:3], s32 offset:228 +; ALIGNED-NEXT: buffer_load_dword v98, off, s[0:3], s32 offset:224 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v101 offset:72 +; ALIGNED-NEXT: flat_store_byte v[20:21], v101 offset:73 +; ALIGNED-NEXT: s_waitcnt 
vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v100 offset:68 +; ALIGNED-NEXT: flat_store_byte v[20:21], v100 offset:69 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v99 offset:64 +; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:65 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v98 offset:60 +; ALIGNED-NEXT: flat_store_byte v[24:25], v98 offset:64 +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:320 +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:324 +; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:328 +; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:332 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 -; ALIGNED-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 -; ALIGNED-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52 -; ALIGNED-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:48 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v33 offset:206 -; ALIGNED-NEXT: flat_store_byte v[16:17], v33 offset:204 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v32 offset:202 -; ALIGNED-NEXT: flat_store_byte v[16:17], v32 offset:200 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v31 offset:198 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:196 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v30 offset:194 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:192 -; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 -; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:132 -; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:136 -; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:140 +; ALIGNED-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:328 
+; ALIGNED-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:332 +; ALIGNED-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:320 +; ALIGNED-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:324 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v29 offset:36 +; ALIGNED-NEXT: flat_store_byte v[20:21], v29 offset:37 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v28 offset:40 +; ALIGNED-NEXT: flat_store_byte v[20:21], v28 offset:41 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v27 offset:28 +; ALIGNED-NEXT: flat_store_byte v[24:25], v27 offset:32 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v26 offset:32 +; ALIGNED-NEXT: flat_store_byte v[20:21], v26 offset:33 +; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:272 +; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:276 +; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:280 +; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:140 -; ALIGNED-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:136 -; ALIGNED-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:132 -; ALIGNED-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:128 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v37 offset:190 -; ALIGNED-NEXT: flat_store_byte v[16:17], v37 offset:188 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v36 offset:186 -; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:184 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v35 offset:182 -; ALIGNED-NEXT: flat_store_byte v[16:17], v35 offset:180 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v34 offset:178 -; 
ALIGNED-NEXT: flat_store_byte v[16:17], v34 offset:176 -; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:144 -; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:148 -; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:152 -; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:156 +; ALIGNED-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:284 +; ALIGNED-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:280 +; ALIGNED-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:276 +; ALIGNED-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v33 offset:24 +; ALIGNED-NEXT: flat_store_byte v[20:21], v33 offset:25 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v32 offset:20 +; ALIGNED-NEXT: flat_store_byte v[20:21], v32 offset:21 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v31 offset:16 +; ALIGNED-NEXT: flat_store_byte v[20:21], v31 offset:17 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v30 offset:12 +; ALIGNED-NEXT: flat_store_byte v[24:25], v30 offset:16 +; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:288 +; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:292 +; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:296 +; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:300 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:156 -; ALIGNED-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:152 -; ALIGNED-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:148 -; ALIGNED-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:144 +; ALIGNED-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:300 +; ALIGNED-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:296 +; ALIGNED-NEXT: buffer_load_dword v35, off, s[0:3], 
s32 offset:292 +; ALIGNED-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:288 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v49 offset:174 -; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:172 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v37 offset:8 +; ALIGNED-NEXT: flat_store_byte v[20:21], v37 offset:9 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v48 offset:170 -; ALIGNED-NEXT: flat_store_byte v[16:17], v48 offset:168 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v36 offset:4 +; ALIGNED-NEXT: flat_store_byte v[24:25], v36 offset:8 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v39 offset:166 -; ALIGNED-NEXT: flat_store_byte v[16:17], v39 offset:164 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v35 +; ALIGNED-NEXT: flat_store_byte v[24:25], v35 offset:4 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v38 offset:162 -; ALIGNED-NEXT: flat_store_byte v[16:17], v38 offset:160 -; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:96 -; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:100 -; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:104 -; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:108 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[24:25], v34 offset:2 +; ALIGNED-NEXT: flat_store_byte v[24:25], v34 +; ALIGNED-NEXT: s_waitcnt lgkmcnt(40) +; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:112 +; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:116 +; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:120 +; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:124 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:108 -; ALIGNED-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:104 -; ALIGNED-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:100 -; 
ALIGNED-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:96 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v53 offset:158 -; ALIGNED-NEXT: flat_store_byte v[16:17], v53 offset:156 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v52 offset:154 -; ALIGNED-NEXT: flat_store_byte v[16:17], v52 offset:152 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v51 offset:150 -; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:148 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v50 offset:146 -; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:144 -; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 -; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:116 -; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:120 -; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:124 +; ALIGNED-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:124 +; ALIGNED-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:120 +; ALIGNED-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:116 +; ALIGNED-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:112 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v49 offset:248 +; ALIGNED-NEXT: flat_store_byte v[20:21], v49 offset:249 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v48 offset:244 +; ALIGNED-NEXT: flat_store_byte v[20:21], v48 offset:245 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v39 offset:240 +; ALIGNED-NEXT: flat_store_byte v[20:21], v39 offset:241 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v38 offset:236 +; ALIGNED-NEXT: flat_store_byte v[20:21], v38 offset:237 +; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:128 +; ALIGNED-NEXT: buffer_store_dword v53, off, 
s[0:3], s32 offset:132 +; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:136 +; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:140 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:124 -; ALIGNED-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:120 -; ALIGNED-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:116 -; ALIGNED-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:112 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v65 offset:142 -; ALIGNED-NEXT: flat_store_byte v[16:17], v65 offset:140 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v64 offset:138 -; ALIGNED-NEXT: flat_store_byte v[16:17], v64 offset:136 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v55 offset:134 -; ALIGNED-NEXT: flat_store_byte v[16:17], v55 offset:132 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v54 offset:130 -; ALIGNED-NEXT: flat_store_byte v[16:17], v54 offset:128 -; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:192 -; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:196 -; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:200 -; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:140 +; ALIGNED-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:136 +; ALIGNED-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:132 +; ALIGNED-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:128 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v53 offset:232 +; ALIGNED-NEXT: flat_store_byte v[20:21], v53 offset:233 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v52 offset:228 +; ALIGNED-NEXT: flat_store_byte v[20:21], v52 offset:229 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) 
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v51 offset:224 +; ALIGNED-NEXT: flat_store_byte v[20:21], v51 offset:225 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v50 offset:220 +; ALIGNED-NEXT: flat_store_byte v[20:21], v50 offset:221 +; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:80 +; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:84 +; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:88 +; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:92 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:204 -; ALIGNED-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:200 -; ALIGNED-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:196 -; ALIGNED-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:192 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v69 offset:126 -; ALIGNED-NEXT: flat_store_byte v[16:17], v69 offset:124 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v68 offset:122 -; ALIGNED-NEXT: flat_store_byte v[16:17], v68 offset:120 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v67 offset:118 -; ALIGNED-NEXT: flat_store_byte v[16:17], v67 offset:116 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v66 offset:114 -; ALIGNED-NEXT: flat_store_byte v[16:17], v66 offset:112 -; ALIGNED-NEXT: buffer_store_dword v116, off, s[0:3], s32 offset:208 -; ALIGNED-NEXT: buffer_store_dword v117, off, s[0:3], s32 offset:212 -; ALIGNED-NEXT: buffer_store_dword v118, off, s[0:3], s32 offset:216 -; ALIGNED-NEXT: buffer_store_dword v119, off, s[0:3], s32 offset:220 +; ALIGNED-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:92 +; ALIGNED-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:88 +; ALIGNED-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:84 +; ALIGNED-NEXT: 
buffer_load_dword v54, off, s[0:3], s32 offset:80 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v65 offset:216 +; ALIGNED-NEXT: flat_store_byte v[20:21], v65 offset:217 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v64 offset:212 +; ALIGNED-NEXT: flat_store_byte v[20:21], v64 offset:213 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v55 offset:208 +; ALIGNED-NEXT: flat_store_byte v[20:21], v55 offset:209 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v54 offset:204 +; ALIGNED-NEXT: flat_store_byte v[20:21], v54 offset:205 +; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:96 +; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:100 +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:104 +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:108 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:220 -; ALIGNED-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:216 -; ALIGNED-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:212 -; ALIGNED-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:108 +; ALIGNED-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:104 +; ALIGNED-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:100 +; ALIGNED-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:96 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v81 offset:110 -; ALIGNED-NEXT: flat_store_byte v[16:17], v81 offset:108 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v69 offset:200 +; ALIGNED-NEXT: flat_store_byte v[20:21], v69 offset:201 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v80 offset:106 -; ALIGNED-NEXT: flat_store_byte v[16:17], v80 offset:104 +; ALIGNED-NEXT: 
flat_store_byte_d16_hi v[22:23], v68 offset:196 +; ALIGNED-NEXT: flat_store_byte v[20:21], v68 offset:197 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v71 offset:102 -; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:100 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v67 offset:192 +; ALIGNED-NEXT: flat_store_byte v[20:21], v67 offset:193 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v70 offset:98 -; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:96 -; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:160 -; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:164 -; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:168 -; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:172 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v66 offset:188 +; ALIGNED-NEXT: flat_store_byte v[20:21], v66 offset:189 +; ALIGNED-NEXT: buffer_store_dword v116, off, s[0:3], s32 offset:176 +; ALIGNED-NEXT: buffer_store_dword v117, off, s[0:3], s32 offset:180 +; ALIGNED-NEXT: buffer_store_dword v118, off, s[0:3], s32 offset:184 +; ALIGNED-NEXT: buffer_store_dword v119, off, s[0:3], s32 offset:188 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v85, off, s[0:3], s32 offset:172 -; ALIGNED-NEXT: buffer_load_dword v84, off, s[0:3], s32 offset:168 -; ALIGNED-NEXT: buffer_load_dword v83, off, s[0:3], s32 offset:164 -; ALIGNED-NEXT: buffer_load_dword v82, off, s[0:3], s32 offset:160 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v85 offset:94 -; ALIGNED-NEXT: flat_store_byte v[16:17], v85 offset:92 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v84 offset:90 -; ALIGNED-NEXT: flat_store_byte v[16:17], v84 offset:88 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v83 offset:86 -; ALIGNED-NEXT: flat_store_byte v[16:17], v83 offset:84 -; 
ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v82 offset:82 -; ALIGNED-NEXT: flat_store_byte v[16:17], v82 offset:80 -; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:176 -; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:180 -; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:184 -; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:188 +; ALIGNED-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:188 +; ALIGNED-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:184 +; ALIGNED-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:180 +; ALIGNED-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:176 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v81 offset:184 +; ALIGNED-NEXT: flat_store_byte v[20:21], v81 offset:185 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v80 offset:180 +; ALIGNED-NEXT: flat_store_byte v[20:21], v80 offset:181 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v71 offset:176 +; ALIGNED-NEXT: flat_store_byte v[20:21], v71 offset:177 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v70 offset:172 +; ALIGNED-NEXT: flat_store_byte v[20:21], v70 offset:173 +; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:196 +; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:200 +; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:204 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v97, off, s[0:3], s32 offset:188 -; ALIGNED-NEXT: buffer_load_dword v96, off, s[0:3], s32 offset:184 -; ALIGNED-NEXT: buffer_load_dword v87, off, s[0:3], s32 offset:180 -; ALIGNED-NEXT: buffer_load_dword v86, off, s[0:3], s32 offset:176 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v19 -; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v19, 8, v19 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v18 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 8, v18 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 8, v24 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v97 offset:78 -; ALIGNED-NEXT: flat_store_byte v[16:17], v97 offset:76 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v96 offset:74 -; ALIGNED-NEXT: flat_store_byte v[16:17], v96 offset:72 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v87 offset:70 -; ALIGNED-NEXT: flat_store_byte v[16:17], v87 offset:68 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v86 offset:66 -; ALIGNED-NEXT: flat_store_byte v[16:17], v86 offset:64 -; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:256 -; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:260 -; ALIGNED-NEXT: buffer_store_dword v100, off, s[0:3], s32 offset:264 -; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:268 +; ALIGNED-NEXT: buffer_load_dword v85, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: buffer_load_dword v84, off, s[0:3], s32 offset:200 +; ALIGNED-NEXT: buffer_load_dword v83, off, s[0:3], s32 offset:196 +; ALIGNED-NEXT: buffer_load_dword v82, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v85 offset:168 +; ALIGNED-NEXT: flat_store_byte v[20:21], v85 offset:169 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v84 offset:164 +; ALIGNED-NEXT: flat_store_byte v[20:21], v84 offset:165 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v83 offset:160 +; ALIGNED-NEXT: flat_store_byte v[20:21], v83 offset:161 +; ALIGNED-NEXT: s_waitcnt 
vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v82 offset:156 +; ALIGNED-NEXT: flat_store_byte v[20:21], v82 offset:157 +; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:144 +; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:148 +; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:152 +; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:156 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v101, off, s[0:3], s32 offset:268 -; ALIGNED-NEXT: buffer_load_dword v100, off, s[0:3], s32 offset:264 -; ALIGNED-NEXT: buffer_load_dword v99, off, s[0:3], s32 offset:260 -; ALIGNED-NEXT: buffer_load_dword v98, off, s[0:3], s32 offset:256 +; ALIGNED-NEXT: buffer_load_dword v97, off, s[0:3], s32 offset:156 +; ALIGNED-NEXT: buffer_load_dword v96, off, s[0:3], s32 offset:152 +; ALIGNED-NEXT: buffer_load_dword v87, off, s[0:3], s32 offset:148 +; ALIGNED-NEXT: buffer_load_dword v86, off, s[0:3], s32 offset:144 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v102 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v102 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v103 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v103 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 8, v56 +; ALIGNED-NEXT: flat_store_byte v[20:21], v112 offset:140 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v56 +; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:138 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v57 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v57 +; ALIGNED-NEXT: flat_store_byte v[20:21], v113 offset:136 +; ALIGNED-NEXT: flat_store_byte v[20:21], v103 offset:134 +; ALIGNED-NEXT: flat_store_byte v[20:21], v112 offset:132 +; ALIGNED-NEXT: flat_store_byte v[20:21], v114 offset:130 +; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:128 +; ALIGNED-NEXT: flat_store_byte v[20:21], v115 offset:126 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v97 offset:152 +; ALIGNED-NEXT: flat_store_byte v[20:21], v97 
offset:153 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v96 offset:148 +; ALIGNED-NEXT: flat_store_byte v[20:21], v96 offset:149 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v87 offset:144 +; ALIGNED-NEXT: flat_store_byte v[20:21], v87 offset:145 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v86 offset:140 +; ALIGNED-NEXT: flat_store_byte v[20:21], v86 offset:141 +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:240 +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:244 +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:248 +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:252 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:252 +; ALIGNED-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 +; ALIGNED-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:244 +; ALIGNED-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:240 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v101 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v101 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v100 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v100 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 8, v34 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v37 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v37 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v34 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v101 offset:62 -; ALIGNED-NEXT: flat_store_byte v[16:17], v101 offset:60 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v19 offset:120 +; ALIGNED-NEXT: flat_store_byte v[20:21], v19 offset:121 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: 
flat_store_byte_d16_hi v[16:17], v100 offset:58 -; ALIGNED-NEXT: flat_store_byte v[16:17], v100 offset:56 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v18 offset:116 +; ALIGNED-NEXT: flat_store_byte v[20:21], v18 offset:117 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v99 offset:54 -; ALIGNED-NEXT: flat_store_byte v[16:17], v99 offset:52 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v17 offset:112 +; ALIGNED-NEXT: flat_store_byte v[20:21], v17 offset:113 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v98 offset:50 -; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:48 -; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:272 -; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:276 -; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:280 -; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:284 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v16 offset:108 +; ALIGNED-NEXT: flat_store_byte v[20:21], v16 offset:109 +; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:256 +; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:260 +; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:264 +; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:268 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:280 -; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:284 -; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:272 -; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:276 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v15 offset:42 -; ALIGNED-NEXT: flat_store_byte v[16:17], v15 offset:40 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v14 offset:46 -; ALIGNED-NEXT: flat_store_byte v[16:17], v14 offset:44 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; 
ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v13 offset:34 -; ALIGNED-NEXT: flat_store_byte v[16:17], v13 offset:32 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v12 offset:38 -; ALIGNED-NEXT: flat_store_byte v[16:17], v12 offset:36 -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 -; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:228 -; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:232 -; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:236 +; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 +; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:264 +; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:260 +; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:256 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v15 offset:104 +; ALIGNED-NEXT: flat_store_byte v[20:21], v15 offset:105 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v14 offset:100 +; ALIGNED-NEXT: flat_store_byte v[20:21], v14 offset:101 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v13 offset:96 +; ALIGNED-NEXT: flat_store_byte v[20:21], v13 offset:97 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v12 offset:92 +; ALIGNED-NEXT: flat_store_byte v[20:21], v12 offset:93 +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:212 +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:216 +; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:220 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 -; ALIGNED-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 -; ALIGNED-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 -; ALIGNED-NEXT: buffer_load_dword v8, 
off, s[0:3], s32 offset:224 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v11 offset:30 -; ALIGNED-NEXT: flat_store_byte v[16:17], v11 offset:28 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v10 offset:26 -; ALIGNED-NEXT: flat_store_byte v[16:17], v10 offset:24 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v9 offset:22 -; ALIGNED-NEXT: flat_store_byte v[16:17], v9 offset:20 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v8 offset:18 -; ALIGNED-NEXT: flat_store_byte v[16:17], v8 offset:16 -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:240 -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:244 -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:252 +; ALIGNED-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:220 +; ALIGNED-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 +; ALIGNED-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:212 +; ALIGNED-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:76 +; ALIGNED-NEXT: flat_store_byte v[20:21], v101 offset:74 +; ALIGNED-NEXT: flat_store_byte v[20:21], v103 offset:72 +; ALIGNED-NEXT: flat_store_byte v[20:21], v100 offset:70 +; ALIGNED-NEXT: flat_store_byte v[20:21], v112 offset:68 +; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:66 +; ALIGNED-NEXT: flat_store_byte v[20:21], v113 offset:64 +; ALIGNED-NEXT: flat_store_byte v[20:21], v98 offset:62 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v11 offset:88 +; ALIGNED-NEXT: flat_store_byte v[20:21], v11 offset:89 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v10 offset:84 +; ALIGNED-NEXT: flat_store_byte v[20:21], v10 offset:85 +; ALIGNED-NEXT: s_waitcnt 
vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v9 offset:80 +; ALIGNED-NEXT: flat_store_byte v[20:21], v9 offset:81 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v8 offset:76 +; ALIGNED-NEXT: flat_store_byte v[20:21], v8 offset:77 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:304 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:308 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:312 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:316 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 -; ALIGNED-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 -; ALIGNED-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:244 -; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240 -; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:247 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v27 -; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:255 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v23 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 8, v23 -; ALIGNED-NEXT: flat_store_byte v[16:17], v21 offset:253 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 24, v22 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 8, v22 -; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:251 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v29 +; ALIGNED-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 +; ALIGNED-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:312 +; ALIGNED-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:308 +; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:304 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v26 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v29 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v29 -; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:249 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 24, v28 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v28 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 8, v28 +; 
ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v27 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 8, v27 -; ALIGNED-NEXT: flat_store_byte v[16:17], v19 offset:245 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 24, v26 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 8, v26 -; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:215 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v51 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v51 -; ALIGNED-NEXT: flat_store_byte v[16:17], v113 offset:243 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v33 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v33 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v33 -; ALIGNED-NEXT: flat_store_byte v[16:17], v18 offset:241 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 24, v32 +; ALIGNED-NEXT: flat_store_byte v[20:21], v101 offset:36 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v50 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 8, v50 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v32 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 8, v32 -; ALIGNED-NEXT: flat_store_byte v[16:17], v114 offset:239 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v31 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v31 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v31 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:237 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v30 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 8, v30 -; ALIGNED-NEXT: flat_store_byte v[16:17], v115 offset:235 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v37 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v37 -; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:233 +; ALIGNED-NEXT: flat_store_byte v[24:25], v114 offset:1 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v36 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 8, v36 -; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:231 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v35 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v36 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v35 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v35 -; ALIGNED-NEXT: 
flat_store_byte v[16:17], v23 offset:229 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 24, v34 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 8, v34 -; ALIGNED-NEXT: flat_store_byte v[16:17], v21 offset:227 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 24, v49 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v49 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v49 -; ALIGNED-NEXT: flat_store_byte v[16:17], v22 offset:225 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 24, v48 +; ALIGNED-NEXT: flat_store_byte v[20:21], v98 offset:40 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v48 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 8, v48 -; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:223 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v39 +; ALIGNED-NEXT: flat_store_byte v[20:21], v29 offset:38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 24, v39 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v39 -; ALIGNED-NEXT: flat_store_byte v[16:17], v29 offset:221 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 24, v38 +; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:44 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v38 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 8, v38 -; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:219 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 24, v53 +; ALIGNED-NEXT: flat_store_byte v[20:21], v28 offset:42 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v53 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v53 -; ALIGNED-NEXT: flat_store_byte v[16:17], v28 offset:217 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v52 +; ALIGNED-NEXT: flat_store_byte v[20:21], v100 offset:32 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v52 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 8, v52 -; ALIGNED-NEXT: flat_store_byte v[16:17], v27 offset:213 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 24, v50 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 8, v50 -; ALIGNED-NEXT: flat_store_byte v[16:17], v19 offset:211 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 24, v65 +; ALIGNED-NEXT: flat_store_byte v[20:21], v27 offset:30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 24, v51 +; 
ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v51 +; ALIGNED-NEXT: flat_store_byte v[20:21], v26 offset:34 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 24, v65 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v65 -; ALIGNED-NEXT: flat_store_byte v[16:17], v26 offset:209 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 24, v64 -; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:149 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v8 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 8, v8 +; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:28 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v64 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v64 -; ALIGNED-NEXT: flat_store_byte v[16:17], v113 offset:207 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v55 +; ALIGNED-NEXT: flat_store_byte v[20:21], v33 offset:26 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 24, v55 +; ALIGNED-NEXT: flat_store_byte v[20:21], v50 offset:222 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 8, v8 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v55 -; ALIGNED-NEXT: flat_store_byte v[16:17], v33 offset:205 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 24, v54 +; ALIGNED-NEXT: flat_store_byte v[20:21], v103 offset:24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v54 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v54 -; ALIGNED-NEXT: flat_store_byte v[16:17], v18 offset:203 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 24, v69 +; ALIGNED-NEXT: flat_store_byte v[20:21], v32 offset:22 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v69 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v69 -; ALIGNED-NEXT: flat_store_byte v[16:17], v32 offset:201 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v68 +; ALIGNED-NEXT: flat_store_byte v[20:21], v112 offset:20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v68 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v68 -; ALIGNED-NEXT: flat_store_byte v[16:17], v114 offset:199 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v67 +; ALIGNED-NEXT: flat_store_byte v[20:21], v31 offset:18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, 
v67 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v67 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:197 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v66 +; ALIGNED-NEXT: flat_store_byte v[20:21], v113 offset:16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v66 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 8, v66 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:195 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v81 +; ALIGNED-NEXT: flat_store_byte v[20:21], v30 offset:14 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v81 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v81 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:193 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v80 +; ALIGNED-NEXT: flat_store_byte v[20:21], v115 offset:12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v80 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v80 -; ALIGNED-NEXT: flat_store_byte v[16:17], v115 offset:191 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v71 +; ALIGNED-NEXT: flat_store_byte v[20:21], v37 offset:10 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 24, v71 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v71 -; ALIGNED-NEXT: flat_store_byte v[16:17], v37 offset:189 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 24, v70 +; ALIGNED-NEXT: flat_store_byte v[20:21], v24 offset:8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v70 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v70 -; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:187 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v85 +; ALIGNED-NEXT: flat_store_byte v[20:21], v25 offset:6 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v85 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v85 -; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:185 +; ALIGNED-NEXT: flat_store_byte v[20:21], v36 offset:4 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v84 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v84 -; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:183 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v83 +; ALIGNED-NEXT: flat_store_byte v[20:21], v35 offset:2 +; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v35, 24, v83 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v83 -; ALIGNED-NEXT: flat_store_byte v[16:17], v35 offset:181 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 24, v82 +; ALIGNED-NEXT: flat_store_byte v[20:21], v34 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v82 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 8, v82 -; ALIGNED-NEXT: flat_store_byte v[16:17], v23 offset:179 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 24, v97 +; ALIGNED-NEXT: flat_store_byte v[20:21], v114 offset:252 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v97 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 8, v97 -; ALIGNED-NEXT: flat_store_byte v[16:17], v34 offset:177 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v96 +; ALIGNED-NEXT: flat_store_byte v[20:21], v49 offset:250 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 24, v96 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v96 -; ALIGNED-NEXT: flat_store_byte v[16:17], v21 offset:175 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 24, v87 +; ALIGNED-NEXT: flat_store_byte v[20:21], v98 offset:248 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v87 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v87 -; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:173 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 24, v86 +; ALIGNED-NEXT: flat_store_byte v[20:21], v48 offset:246 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v86 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 8, v86 -; ALIGNED-NEXT: flat_store_byte v[16:17], v22 offset:171 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 24, v101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v101 -; ALIGNED-NEXT: flat_store_byte v[16:17], v48 offset:169 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v100 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v100 -; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:167 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v99 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v99 -; ALIGNED-NEXT: flat_store_byte v[16:17], v39 offset:165 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 24, v98 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98 -; ALIGNED-NEXT: 
flat_store_byte v[16:17], v29 offset:163 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 24, v15 +; ALIGNED-NEXT: flat_store_byte v[20:21], v29 offset:244 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 24, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 8, v19 +; ALIGNED-NEXT: flat_store_byte v[20:21], v39 offset:242 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 24, v18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 8, v18 +; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:240 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v17 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v17, 8, v17 +; ALIGNED-NEXT: flat_store_byte v[20:21], v38 offset:238 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 24, v16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v16, 8, v16 +; ALIGNED-NEXT: flat_store_byte v[20:21], v28 offset:236 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v15 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v15, 8, v15 -; ALIGNED-NEXT: flat_store_byte v[16:17], v38 offset:161 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 24, v14 +; ALIGNED-NEXT: flat_store_byte v[20:21], v53 offset:234 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v14 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v14, 8, v14 -; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:159 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 24, v13 +; ALIGNED-NEXT: flat_store_byte v[20:21], v100 offset:232 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v13 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v13, 8, v13 -; ALIGNED-NEXT: flat_store_byte v[16:17], v53 offset:157 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v12 +; ALIGNED-NEXT: flat_store_byte v[20:21], v52 offset:230 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v12 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v12, 8, v12 -; ALIGNED-NEXT: flat_store_byte v[16:17], v28 offset:155 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v11 +; ALIGNED-NEXT: flat_store_byte v[20:21], v27 offset:228 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 24, v11 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v11, 8, v11 -; ALIGNED-NEXT: flat_store_byte v[16:17], v52 offset:153 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v10 +; 
ALIGNED-NEXT: flat_store_byte v[20:21], v51 offset:226 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v10 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v10, 8, v10 -; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:151 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v9 +; ALIGNED-NEXT: flat_store_byte v[20:21], v101 offset:224 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v9 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v9, 8, v9 -; ALIGNED-NEXT: flat_store_byte v[16:17], v27 offset:147 -; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:145 -; ALIGNED-NEXT: flat_store_byte v[16:17], v19 offset:143 -; ALIGNED-NEXT: flat_store_byte v[16:17], v65 offset:141 -; ALIGNED-NEXT: flat_store_byte v[16:17], v26 offset:139 -; ALIGNED-NEXT: flat_store_byte v[16:17], v64 offset:137 -; ALIGNED-NEXT: flat_store_byte v[16:17], v113 offset:135 -; ALIGNED-NEXT: flat_store_byte v[16:17], v55 offset:133 -; ALIGNED-NEXT: flat_store_byte v[16:17], v33 offset:131 -; ALIGNED-NEXT: flat_store_byte v[16:17], v54 offset:129 -; ALIGNED-NEXT: flat_store_byte v[16:17], v18 offset:127 -; ALIGNED-NEXT: flat_store_byte v[16:17], v69 offset:125 -; ALIGNED-NEXT: flat_store_byte v[16:17], v32 offset:123 -; ALIGNED-NEXT: flat_store_byte v[16:17], v68 offset:121 -; ALIGNED-NEXT: flat_store_byte v[16:17], v114 offset:119 -; ALIGNED-NEXT: flat_store_byte v[16:17], v67 offset:117 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:115 -; ALIGNED-NEXT: flat_store_byte v[16:17], v66 offset:113 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:111 -; ALIGNED-NEXT: flat_store_byte v[16:17], v81 offset:109 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:107 -; ALIGNED-NEXT: flat_store_byte v[16:17], v80 offset:105 -; ALIGNED-NEXT: flat_store_byte v[16:17], v115 offset:103 -; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:101 -; ALIGNED-NEXT: flat_store_byte v[16:17], v37 offset:99 -; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:97 -; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:95 -; ALIGNED-NEXT: 
flat_store_byte v[16:17], v85 offset:93 -; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:91 -; ALIGNED-NEXT: flat_store_byte v[16:17], v84 offset:89 -; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:87 -; ALIGNED-NEXT: flat_store_byte v[16:17], v83 offset:85 -; ALIGNED-NEXT: flat_store_byte v[16:17], v35 offset:83 -; ALIGNED-NEXT: flat_store_byte v[16:17], v82 offset:81 -; ALIGNED-NEXT: flat_store_byte v[16:17], v23 offset:79 -; ALIGNED-NEXT: flat_store_byte v[16:17], v97 offset:77 -; ALIGNED-NEXT: flat_store_byte v[16:17], v34 offset:75 -; ALIGNED-NEXT: flat_store_byte v[16:17], v96 offset:73 -; ALIGNED-NEXT: flat_store_byte v[16:17], v21 offset:71 -; ALIGNED-NEXT: flat_store_byte v[16:17], v87 offset:69 -; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:67 -; ALIGNED-NEXT: flat_store_byte v[16:17], v86 offset:65 -; ALIGNED-NEXT: flat_store_byte v[16:17], v22 offset:63 -; ALIGNED-NEXT: flat_store_byte v[16:17], v101 offset:61 -; ALIGNED-NEXT: flat_store_byte v[16:17], v48 offset:59 -; ALIGNED-NEXT: flat_store_byte v[16:17], v100 offset:57 -; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:55 -; ALIGNED-NEXT: flat_store_byte v[16:17], v99 offset:53 -; ALIGNED-NEXT: flat_store_byte v[16:17], v39 offset:51 -; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:49 -; ALIGNED-NEXT: flat_store_byte v[16:17], v29 offset:43 -; ALIGNED-NEXT: flat_store_byte v[16:17], v15 offset:41 -; ALIGNED-NEXT: flat_store_byte v[16:17], v38 offset:47 -; ALIGNED-NEXT: flat_store_byte v[16:17], v14 offset:45 -; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:35 -; ALIGNED-NEXT: flat_store_byte v[16:17], v13 offset:33 -; ALIGNED-NEXT: flat_store_byte v[16:17], v53 offset:39 -; ALIGNED-NEXT: flat_store_byte v[16:17], v12 offset:37 -; ALIGNED-NEXT: flat_store_byte v[16:17], v28 offset:31 -; ALIGNED-NEXT: flat_store_byte v[16:17], v11 offset:29 -; ALIGNED-NEXT: flat_store_byte v[16:17], v52 offset:27 -; ALIGNED-NEXT: flat_store_byte v[16:17], v10 offset:25 -; ALIGNED-NEXT: 
flat_store_byte v[16:17], v112 offset:23 -; ALIGNED-NEXT: flat_store_byte v[16:17], v9 offset:21 -; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:19 -; ALIGNED-NEXT: flat_store_byte v[16:17], v8 offset:17 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v7 offset:14 -; ALIGNED-NEXT: flat_store_byte v[16:17], v7 offset:12 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v6 offset:10 -; ALIGNED-NEXT: flat_store_byte v[16:17], v6 offset:8 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v5 offset:6 -; ALIGNED-NEXT: flat_store_byte v[16:17], v5 offset:4 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v4 offset:2 -; ALIGNED-NEXT: flat_store_byte v[16:17], v4 +; ALIGNED-NEXT: flat_store_byte v[20:21], v26 offset:220 +; ALIGNED-NEXT: flat_store_byte v[20:21], v65 offset:218 +; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:216 +; ALIGNED-NEXT: flat_store_byte v[20:21], v64 offset:214 +; ALIGNED-NEXT: flat_store_byte v[20:21], v33 offset:212 +; ALIGNED-NEXT: flat_store_byte v[20:21], v55 offset:210 +; ALIGNED-NEXT: flat_store_byte v[20:21], v103 offset:208 +; ALIGNED-NEXT: flat_store_byte v[20:21], v54 offset:206 +; ALIGNED-NEXT: flat_store_byte v[20:21], v32 offset:204 +; ALIGNED-NEXT: flat_store_byte v[20:21], v69 offset:202 +; ALIGNED-NEXT: flat_store_byte v[20:21], v112 offset:200 +; ALIGNED-NEXT: flat_store_byte v[20:21], v68 offset:198 +; ALIGNED-NEXT: flat_store_byte v[20:21], v31 offset:196 +; ALIGNED-NEXT: flat_store_byte v[20:21], v67 offset:194 +; ALIGNED-NEXT: flat_store_byte v[20:21], v113 offset:192 +; ALIGNED-NEXT: flat_store_byte v[20:21], v66 offset:190 +; ALIGNED-NEXT: flat_store_byte v[20:21], v30 offset:188 +; ALIGNED-NEXT: flat_store_byte v[20:21], v81 offset:186 +; ALIGNED-NEXT: flat_store_byte v[20:21], v115 offset:184 +; ALIGNED-NEXT: flat_store_byte v[20:21], v80 offset:182 +; ALIGNED-NEXT: 
flat_store_byte v[20:21], v37 offset:180 +; ALIGNED-NEXT: flat_store_byte v[20:21], v71 offset:178 +; ALIGNED-NEXT: flat_store_byte v[20:21], v24 offset:176 +; ALIGNED-NEXT: flat_store_byte v[20:21], v70 offset:174 +; ALIGNED-NEXT: flat_store_byte v[20:21], v25 offset:172 +; ALIGNED-NEXT: flat_store_byte v[20:21], v85 offset:170 +; ALIGNED-NEXT: flat_store_byte v[20:21], v36 offset:168 +; ALIGNED-NEXT: flat_store_byte v[20:21], v84 offset:166 +; ALIGNED-NEXT: flat_store_byte v[20:21], v35 offset:164 +; ALIGNED-NEXT: flat_store_byte v[20:21], v83 offset:162 +; ALIGNED-NEXT: flat_store_byte v[20:21], v34 offset:160 +; ALIGNED-NEXT: flat_store_byte v[20:21], v82 offset:158 +; ALIGNED-NEXT: flat_store_byte v[20:21], v114 offset:156 +; ALIGNED-NEXT: flat_store_byte v[20:21], v97 offset:154 +; ALIGNED-NEXT: flat_store_byte v[20:21], v49 offset:152 +; ALIGNED-NEXT: flat_store_byte v[20:21], v96 offset:150 +; ALIGNED-NEXT: flat_store_byte v[20:21], v98 offset:148 +; ALIGNED-NEXT: flat_store_byte v[20:21], v87 offset:146 +; ALIGNED-NEXT: flat_store_byte v[20:21], v48 offset:144 +; ALIGNED-NEXT: flat_store_byte v[20:21], v86 offset:142 +; ALIGNED-NEXT: flat_store_byte v[20:21], v29 offset:124 +; ALIGNED-NEXT: flat_store_byte v[20:21], v19 offset:122 +; ALIGNED-NEXT: flat_store_byte v[20:21], v39 offset:120 +; ALIGNED-NEXT: flat_store_byte v[20:21], v18 offset:118 +; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:116 +; ALIGNED-NEXT: flat_store_byte v[20:21], v17 offset:114 +; ALIGNED-NEXT: flat_store_byte v[20:21], v38 offset:112 +; ALIGNED-NEXT: flat_store_byte v[20:21], v16 offset:110 +; ALIGNED-NEXT: flat_store_byte v[20:21], v28 offset:108 +; ALIGNED-NEXT: flat_store_byte v[20:21], v15 offset:106 +; ALIGNED-NEXT: flat_store_byte v[20:21], v53 offset:104 +; ALIGNED-NEXT: flat_store_byte v[20:21], v14 offset:102 +; ALIGNED-NEXT: flat_store_byte v[20:21], v100 offset:100 +; ALIGNED-NEXT: flat_store_byte v[20:21], v13 offset:98 +; ALIGNED-NEXT: flat_store_byte v[20:21], 
v52 offset:96 +; ALIGNED-NEXT: flat_store_byte v[20:21], v12 offset:94 +; ALIGNED-NEXT: flat_store_byte v[20:21], v27 offset:92 +; ALIGNED-NEXT: flat_store_byte v[20:21], v11 offset:90 +; ALIGNED-NEXT: flat_store_byte v[20:21], v51 offset:88 +; ALIGNED-NEXT: flat_store_byte v[20:21], v10 offset:86 +; ALIGNED-NEXT: flat_store_byte v[20:21], v101 offset:84 +; ALIGNED-NEXT: flat_store_byte v[20:21], v9 offset:82 +; ALIGNED-NEXT: flat_store_byte v[20:21], v50 offset:80 +; ALIGNED-NEXT: flat_store_byte v[20:21], v8 offset:78 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v7 offset:56 +; ALIGNED-NEXT: flat_store_byte v[20:21], v7 offset:57 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v6 offset:52 +; ALIGNED-NEXT: flat_store_byte v[20:21], v6 offset:53 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v5 offset:48 +; ALIGNED-NEXT: flat_store_byte v[20:21], v5 offset:49 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v4 offset:44 +; ALIGNED-NEXT: flat_store_byte v[20:21], v4 offset:45 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 24, v7 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v7, 8, v7 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v9, 24, v6 @@ -6157,14 +6276,14 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v5, 8, v5 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v11, 24, v4 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v4, 8, v4 -; ALIGNED-NEXT: flat_store_byte v[16:17], v8 offset:15 -; ALIGNED-NEXT: flat_store_byte v[16:17], v7 offset:13 -; ALIGNED-NEXT: flat_store_byte v[16:17], v9 offset:11 -; ALIGNED-NEXT: flat_store_byte v[16:17], v6 offset:9 -; ALIGNED-NEXT: flat_store_byte v[16:17], v10 offset:7 -; ALIGNED-NEXT: flat_store_byte v[16:17], v5 offset:5 -; ALIGNED-NEXT: flat_store_byte v[16:17], v11 offset:3 -; ALIGNED-NEXT: flat_store_byte v[16:17], v4 offset:1 +; ALIGNED-NEXT: flat_store_byte 
v[20:21], v8 offset:60 +; ALIGNED-NEXT: flat_store_byte v[20:21], v7 offset:58 +; ALIGNED-NEXT: flat_store_byte v[20:21], v9 offset:56 +; ALIGNED-NEXT: flat_store_byte v[20:21], v6 offset:54 +; ALIGNED-NEXT: flat_store_byte v[20:21], v10 offset:52 +; ALIGNED-NEXT: flat_store_byte v[20:21], v5 offset:50 +; ALIGNED-NEXT: flat_store_byte v[20:21], v11 offset:48 +; ALIGNED-NEXT: flat_store_byte v[20:21], v4 offset:46 ; ALIGNED-NEXT: s_cbranch_scc1 .LBB5_2 ; ALIGNED-NEXT: .LBB5_3: ; %Flow5 ; ALIGNED-NEXT: s_andn2_saveexec_b32 s8, s6 @@ -6175,609 +6294,635 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; ALIGNED-NEXT: s_mov_b32 s7, -1 ; ALIGNED-NEXT: .LBB5_5: ; %memmove_bwd_loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 -; ALIGNED-NEXT: v_add_co_u32 v24, vcc_lo, v2, s4 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v25, null, s5, v3, vcc_lo -; ALIGNED-NEXT: s_clause 0xf -; ALIGNED-NEXT: flat_load_dwordx4 v[16:19], v[24:25] offset:240 -; ALIGNED-NEXT: flat_load_dwordx4 v[20:23], v[24:25] offset:224 -; ALIGNED-NEXT: flat_load_dwordx4 v[4:7], v[24:25] -; ALIGNED-NEXT: flat_load_dwordx4 v[8:11], v[24:25] offset:16 -; ALIGNED-NEXT: flat_load_dwordx4 v[12:15], v[24:25] offset:32 -; ALIGNED-NEXT: flat_load_dwordx4 v[112:115], v[24:25] offset:48 -; ALIGNED-NEXT: flat_load_dwordx4 v[116:119], v[24:25] offset:64 -; ALIGNED-NEXT: flat_load_dwordx4 v[40:43], v[24:25] offset:80 -; ALIGNED-NEXT: flat_load_dwordx4 v[26:29], v[24:25] offset:96 -; ALIGNED-NEXT: flat_load_dwordx4 v[32:35], v[24:25] offset:112 -; ALIGNED-NEXT: flat_load_dwordx4 v[44:47], v[24:25] offset:128 -; ALIGNED-NEXT: flat_load_dwordx4 v[52:55], v[24:25] offset:144 -; ALIGNED-NEXT: flat_load_dwordx4 v[66:69], v[24:25] offset:160 -; ALIGNED-NEXT: flat_load_dwordx4 v[81:84], v[24:25] offset:176 -; ALIGNED-NEXT: flat_load_dwordx4 v[96:99], v[24:25] offset:192 -; ALIGNED-NEXT: flat_load_dwordx4 v[100:103], v[24:25] offset:208 -; ALIGNED-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; 
ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:320 -; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:324 -; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:328 -; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:332 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332 -; ALIGNED-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:328 -; ALIGNED-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:324 -; ALIGNED-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:320 -; ALIGNED-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v17, null, s5, v1, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v4, vcc_lo, v2, s4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v5, null, s5, v3, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v6, vcc_lo, v4, 48 +; ALIGNED-NEXT: flat_load_dwordx4 v[20:23], v[4:5] offset:128 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v8, vcc_lo, 0x50, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v12, vcc_lo, 0x60, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v16, vcc_lo, 0x70, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v24, vcc_lo, 0x90, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v52, vcc_lo, 0xa0, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v53, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v54, vcc_lo, 0xb0, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v55, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v64, vcc_lo, 0xc0, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v65, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v66, vcc_lo, 0xd0, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v67, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v68, vcc_lo, 0xe0, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v69, null, 0, v5, vcc_lo +; 
ALIGNED-NEXT: v_add_co_u32 v70, vcc_lo, 0xf0, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v71, null, 0, v5, vcc_lo +; ALIGNED-NEXT: s_clause 0xe +; ALIGNED-NEXT: flat_load_dwordx4 v[26:29], v[4:5] +; ALIGNED-NEXT: flat_load_dwordx4 v[35:38], v[4:5] offset:16 +; ALIGNED-NEXT: flat_load_dwordx4 v[30:33], v[4:5] offset:32 +; ALIGNED-NEXT: flat_load_dwordx4 v[48:51], v[4:5] offset:64 +; ALIGNED-NEXT: flat_load_dwordx4 v[4:7], v[6:7] +; ALIGNED-NEXT: flat_load_dwordx4 v[8:11], v[8:9] +; ALIGNED-NEXT: flat_load_dwordx4 v[12:15], v[12:13] +; ALIGNED-NEXT: flat_load_dwordx4 v[16:19], v[16:17] +; ALIGNED-NEXT: flat_load_dwordx4 v[112:115], v[24:25] +; ALIGNED-NEXT: flat_load_dwordx4 v[116:119], v[52:53] +; ALIGNED-NEXT: flat_load_dwordx4 v[40:43], v[54:55] +; ALIGNED-NEXT: flat_load_dwordx4 v[44:47], v[64:65] +; ALIGNED-NEXT: flat_load_dwordx4 v[55:58], v[66:67] +; ALIGNED-NEXT: flat_load_dwordx4 v[59:62], v[68:69] +; ALIGNED-NEXT: flat_load_dwordx4 v[68:71], v[70:71] +; ALIGNED-NEXT: v_add_co_u32 v24, vcc_lo, v0, s4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v25, null, s5, v1, vcc_lo ; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00 ; ALIGNED-NEXT: s_addc_u32 s5, s5, -1 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v31 offset:254 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:252 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v30 offset:250 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:248 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v25 offset:246 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:244 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v24 offset:242 -; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:240 -; ALIGNED-NEXT: s_waitcnt lgkmcnt(22) -; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:336 -; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:340 -; ALIGNED-NEXT: 
buffer_store_dword v22, off, s[0:3], s32 offset:344 -; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:348 +; ALIGNED-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:416 +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:420 +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:424 +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:428 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:348 -; ALIGNED-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:344 -; ALIGNED-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:340 -; ALIGNED-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:336 +; ALIGNED-NEXT: buffer_load_dword v102, off, s[0:3], s32 offset:428 +; ALIGNED-NEXT: buffer_load_dword v103, off, s[0:3], s32 offset:424 +; ALIGNED-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:420 +; ALIGNED-NEXT: buffer_load_dword v72, off, s[0:3], s32 offset:416 +; ALIGNED-NEXT: v_add_co_u32 v22, vcc_lo, v24, 6 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v25, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v20, vcc_lo, v24, 3 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v25, vcc_lo ; ALIGNED-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v51 offset:238 -; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:236 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v102 offset:136 +; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:137 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v50 offset:234 -; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:232 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v103 offset:132 +; ALIGNED-NEXT: flat_store_byte v[20:21], v103 offset:133 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v49 offset:230 -; ALIGNED-NEXT: flat_store_byte v[16:17], 
v49 offset:228 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v63 offset:128 +; ALIGNED-NEXT: flat_store_byte v[20:21], v63 offset:129 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v36 offset:226 -; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:224 -; ALIGNED-NEXT: s_waitcnt lgkmcnt(16) -; ALIGNED-NEXT: buffer_store_dword v100, off, s[0:3], s32 offset:288 -; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:292 -; ALIGNED-NEXT: buffer_store_dword v102, off, s[0:3], s32 offset:296 -; ALIGNED-NEXT: buffer_store_dword v103, off, s[0:3], s32 offset:300 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v72 offset:124 +; ALIGNED-NEXT: flat_store_byte v[24:25], v72 offset:128 +; ALIGNED-NEXT: s_waitcnt lgkmcnt(19) +; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:480 +; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:484 +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:488 +; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:492 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:300 -; ALIGNED-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:296 -; ALIGNED-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:292 -; ALIGNED-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:288 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v31 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v31 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v30 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 8, v30 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v71 offset:222 -; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:220 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v70 offset:218 -; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:216 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v65 offset:214 -; ALIGNED-NEXT: flat_store_byte 
v[16:17], v65 offset:212 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v64 offset:210 -; ALIGNED-NEXT: flat_store_byte v[16:17], v64 offset:208 -; ALIGNED-NEXT: buffer_store_dword v96, off, s[0:3], s32 offset:304 -; ALIGNED-NEXT: buffer_store_dword v97, off, s[0:3], s32 offset:308 -; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:312 -; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:316 +; ALIGNED-NEXT: buffer_load_dword v101, off, s[0:3], s32 offset:492 +; ALIGNED-NEXT: buffer_load_dword v100, off, s[0:3], s32 offset:488 +; ALIGNED-NEXT: buffer_load_dword v99, off, s[0:3], s32 offset:484 +; ALIGNED-NEXT: buffer_load_dword v98, off, s[0:3], s32 offset:480 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v101 offset:72 +; ALIGNED-NEXT: flat_store_byte v[20:21], v101 offset:73 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v100 offset:68 +; ALIGNED-NEXT: flat_store_byte v[20:21], v100 offset:69 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v99 offset:64 +; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:65 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v98 offset:60 +; ALIGNED-NEXT: flat_store_byte v[24:25], v98 offset:64 +; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:576 +; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:580 +; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:584 +; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:588 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v87, off, s[0:3], s32 offset:316 -; ALIGNED-NEXT: buffer_load_dword v86, off, s[0:3], s32 offset:312 -; ALIGNED-NEXT: buffer_load_dword v85, off, s[0:3], s32 offset:308 -; ALIGNED-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:304 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: 
flat_store_byte_d16_hi v[16:17], v87 offset:206 -; ALIGNED-NEXT: flat_store_byte v[16:17], v87 offset:204 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v86 offset:202 -; ALIGNED-NEXT: flat_store_byte v[16:17], v86 offset:200 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v85 offset:198 -; ALIGNED-NEXT: flat_store_byte v[16:17], v85 offset:196 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v80 offset:194 -; ALIGNED-NEXT: flat_store_byte v[16:17], v80 offset:192 -; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:384 -; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:388 -; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:392 -; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:396 +; ALIGNED-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:584 +; ALIGNED-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:588 +; ALIGNED-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:576 +; ALIGNED-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:580 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v39 offset:36 +; ALIGNED-NEXT: flat_store_byte v[20:21], v39 offset:37 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v34 offset:40 +; ALIGNED-NEXT: flat_store_byte v[20:21], v34 offset:41 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v33 offset:28 +; ALIGNED-NEXT: flat_store_byte v[24:25], v33 offset:32 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v32 offset:32 +; ALIGNED-NEXT: flat_store_byte v[20:21], v32 offset:33 +; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:528 +; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:532 +; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:536 +; ALIGNED-NEXT: 
buffer_store_dword v38, off, s[0:3], s32 offset:540 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v101, off, s[0:3], s32 offset:396 -; ALIGNED-NEXT: buffer_load_dword v99, off, s[0:3], s32 offset:392 -; ALIGNED-NEXT: buffer_load_dword v96, off, s[0:3], s32 offset:388 -; ALIGNED-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:384 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v101 offset:190 -; ALIGNED-NEXT: flat_store_byte v[16:17], v101 offset:188 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v99 offset:186 -; ALIGNED-NEXT: flat_store_byte v[16:17], v99 offset:184 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v96 offset:182 -; ALIGNED-NEXT: flat_store_byte v[16:17], v96 offset:180 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v81 offset:178 -; ALIGNED-NEXT: flat_store_byte v[16:17], v81 offset:176 -; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:400 -; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:404 -; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:408 -; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:412 +; ALIGNED-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:540 +; ALIGNED-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:536 +; ALIGNED-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:532 +; ALIGNED-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:528 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v67 offset:24 +; ALIGNED-NEXT: flat_store_byte v[20:21], v67 offset:25 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v66 offset:20 +; ALIGNED-NEXT: flat_store_byte v[20:21], v66 offset:21 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v53 offset:16 +; ALIGNED-NEXT: flat_store_byte v[20:21], 
v53 offset:17 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v52 offset:12 +; ALIGNED-NEXT: flat_store_byte v[24:25], v52 offset:16 +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:544 +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:548 +; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:552 +; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:556 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v100, off, s[0:3], s32 offset:412 -; ALIGNED-NEXT: buffer_load_dword v97, off, s[0:3], s32 offset:408 -; ALIGNED-NEXT: buffer_load_dword v82, off, s[0:3], s32 offset:404 -; ALIGNED-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:400 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v100 offset:174 -; ALIGNED-NEXT: flat_store_byte v[16:17], v100 offset:172 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v97 offset:170 -; ALIGNED-NEXT: flat_store_byte v[16:17], v97 offset:168 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v82 offset:166 -; ALIGNED-NEXT: flat_store_byte v[16:17], v82 offset:164 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v66 offset:162 -; ALIGNED-NEXT: flat_store_byte v[16:17], v66 offset:160 -; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:352 -; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:356 -; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:360 -; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:364 +; ALIGNED-NEXT: buffer_load_dword v86, off, s[0:3], s32 offset:556 +; ALIGNED-NEXT: buffer_load_dword v82, off, s[0:3], s32 offset:552 +; ALIGNED-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:548 +; ALIGNED-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:544 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: 
flat_store_byte_d16_hi v[22:23], v86 offset:8 +; ALIGNED-NEXT: flat_store_byte v[20:21], v86 offset:9 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v82 offset:4 +; ALIGNED-NEXT: flat_store_byte v[24:25], v82 offset:8 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v81 +; ALIGNED-NEXT: flat_store_byte v[24:25], v81 offset:4 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[24:25], v80 offset:2 +; ALIGNED-NEXT: flat_store_byte v[24:25], v80 +; ALIGNED-NEXT: s_waitcnt lgkmcnt(40) +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:368 +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:372 +; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:376 +; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:380 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v98, off, s[0:3], s32 offset:364 -; ALIGNED-NEXT: buffer_load_dword v83, off, s[0:3], s32 offset:360 -; ALIGNED-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:356 -; ALIGNED-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:352 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v98 offset:158 -; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:156 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v83 offset:154 -; ALIGNED-NEXT: flat_store_byte v[16:17], v83 offset:152 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v67 offset:150 -; ALIGNED-NEXT: flat_store_byte v[16:17], v67 offset:148 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v52 offset:146 -; ALIGNED-NEXT: flat_store_byte v[16:17], v52 offset:144 -; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:368 -; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:372 -; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 
offset:376 -; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:380 +; ALIGNED-NEXT: buffer_load_dword v97, off, s[0:3], s32 offset:380 +; ALIGNED-NEXT: buffer_load_dword v87, off, s[0:3], s32 offset:376 +; ALIGNED-NEXT: buffer_load_dword v83, off, s[0:3], s32 offset:372 +; ALIGNED-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:368 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v97 offset:248 +; ALIGNED-NEXT: flat_store_byte v[20:21], v97 offset:249 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v87 offset:244 +; ALIGNED-NEXT: flat_store_byte v[20:21], v87 offset:245 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v83 offset:240 +; ALIGNED-NEXT: flat_store_byte v[20:21], v83 offset:241 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v68 offset:236 +; ALIGNED-NEXT: flat_store_byte v[20:21], v68 offset:237 +; ALIGNED-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:384 +; ALIGNED-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:388 +; ALIGNED-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:392 +; ALIGNED-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v84, off, s[0:3], s32 offset:380 -; ALIGNED-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:376 -; ALIGNED-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:372 -; ALIGNED-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:368 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v84 offset:142 -; ALIGNED-NEXT: flat_store_byte v[16:17], v84 offset:140 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v68 offset:138 -; ALIGNED-NEXT: flat_store_byte v[16:17], v68 offset:136 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v53 offset:134 -; 
ALIGNED-NEXT: flat_store_byte v[16:17], v53 offset:132 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v37 offset:130 -; ALIGNED-NEXT: flat_store_byte v[16:17], v37 offset:128 -; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:448 -; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:452 -; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:456 -; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:460 +; ALIGNED-NEXT: buffer_load_dword v96, off, s[0:3], s32 offset:396 +; ALIGNED-NEXT: buffer_load_dword v84, off, s[0:3], s32 offset:392 +; ALIGNED-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:388 +; ALIGNED-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:384 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v96 offset:232 +; ALIGNED-NEXT: flat_store_byte v[20:21], v96 offset:233 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v84 offset:228 +; ALIGNED-NEXT: flat_store_byte v[20:21], v84 offset:229 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v69 offset:224 +; ALIGNED-NEXT: flat_store_byte v[20:21], v69 offset:225 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v54 offset:220 +; ALIGNED-NEXT: flat_store_byte v[20:21], v54 offset:221 +; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:336 +; ALIGNED-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:340 +; ALIGNED-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:344 +; ALIGNED-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:348 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:460 -; ALIGNED-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:456 -; ALIGNED-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:452 -; ALIGNED-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:448 -; ALIGNED-NEXT: 
s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v69 offset:126 -; ALIGNED-NEXT: flat_store_byte v[16:17], v69 offset:124 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v54 offset:122 -; ALIGNED-NEXT: flat_store_byte v[16:17], v54 offset:120 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v38 offset:118 -; ALIGNED-NEXT: flat_store_byte v[16:17], v38 offset:116 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v32 offset:114 -; ALIGNED-NEXT: flat_store_byte v[16:17], v32 offset:112 -; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:464 -; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:468 -; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:472 -; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:476 +; ALIGNED-NEXT: buffer_load_dword v85, off, s[0:3], s32 offset:348 +; ALIGNED-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:344 +; ALIGNED-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:340 +; ALIGNED-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:336 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v85 offset:216 +; ALIGNED-NEXT: flat_store_byte v[20:21], v85 offset:217 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v70 offset:212 +; ALIGNED-NEXT: flat_store_byte v[20:21], v70 offset:213 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v55 offset:208 +; ALIGNED-NEXT: flat_store_byte v[20:21], v55 offset:209 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v48 offset:204 +; ALIGNED-NEXT: flat_store_byte v[20:21], v48 offset:205 +; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:352 +; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:356 +; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 
offset:360 +; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:364 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:476 -; ALIGNED-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:472 -; ALIGNED-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:468 -; ALIGNED-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:464 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v55 offset:110 -; ALIGNED-NEXT: flat_store_byte v[16:17], v55 offset:108 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v39 offset:106 -; ALIGNED-NEXT: flat_store_byte v[16:17], v39 offset:104 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v33 offset:102 -; ALIGNED-NEXT: flat_store_byte v[16:17], v33 offset:100 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v26 offset:98 -; ALIGNED-NEXT: flat_store_byte v[16:17], v26 offset:96 -; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:416 -; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:420 -; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:424 -; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:428 +; ALIGNED-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:364 +; ALIGNED-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:360 +; ALIGNED-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:356 +; ALIGNED-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:352 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v71 offset:200 +; ALIGNED-NEXT: flat_store_byte v[20:21], v71 offset:201 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v64 offset:196 +; ALIGNED-NEXT: flat_store_byte v[20:21], v64 offset:197 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v49 offset:192 +; ALIGNED-NEXT: 
flat_store_byte v[20:21], v49 offset:193 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v35 offset:188 +; ALIGNED-NEXT: flat_store_byte v[20:21], v35 offset:189 +; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:432 +; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:436 +; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:440 +; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:444 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:428 -; ALIGNED-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:424 -; ALIGNED-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:420 -; ALIGNED-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:416 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v48 offset:94 -; ALIGNED-NEXT: flat_store_byte v[16:17], v48 offset:92 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v34 offset:90 -; ALIGNED-NEXT: flat_store_byte v[16:17], v34 offset:88 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v27 offset:86 -; ALIGNED-NEXT: flat_store_byte v[16:17], v27 offset:84 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v21 offset:82 -; ALIGNED-NEXT: flat_store_byte v[16:17], v21 offset:80 -; ALIGNED-NEXT: buffer_store_dword v116, off, s[0:3], s32 offset:432 -; ALIGNED-NEXT: buffer_store_dword v117, off, s[0:3], s32 offset:436 -; ALIGNED-NEXT: buffer_store_dword v118, off, s[0:3], s32 offset:440 -; ALIGNED-NEXT: buffer_store_dword v119, off, s[0:3], s32 offset:444 +; ALIGNED-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:444 +; ALIGNED-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:440 +; ALIGNED-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:436 +; ALIGNED-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:432 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; 
ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v65 offset:184 +; ALIGNED-NEXT: flat_store_byte v[20:21], v65 offset:185 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v50 offset:180 +; ALIGNED-NEXT: flat_store_byte v[20:21], v50 offset:181 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v36 offset:176 +; ALIGNED-NEXT: flat_store_byte v[20:21], v36 offset:177 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v29 offset:172 +; ALIGNED-NEXT: flat_store_byte v[20:21], v29 offset:173 +; ALIGNED-NEXT: buffer_store_dword v116, off, s[0:3], s32 offset:448 +; ALIGNED-NEXT: buffer_store_dword v117, off, s[0:3], s32 offset:452 +; ALIGNED-NEXT: buffer_store_dword v118, off, s[0:3], s32 offset:456 +; ALIGNED-NEXT: buffer_store_dword v119, off, s[0:3], s32 offset:460 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:444 -; ALIGNED-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:440 -; ALIGNED-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:436 -; ALIGNED-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:432 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v35 offset:78 -; ALIGNED-NEXT: flat_store_byte v[16:17], v35 offset:76 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v28 offset:74 -; ALIGNED-NEXT: flat_store_byte v[16:17], v28 offset:72 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v22 offset:70 -; ALIGNED-NEXT: flat_store_byte v[16:17], v22 offset:68 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v19 offset:66 -; ALIGNED-NEXT: flat_store_byte v[16:17], v19 offset:64 -; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:512 -; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:516 -; ALIGNED-NEXT: buffer_store_dword v114, off, 
s[0:3], s32 offset:520 -; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:524 +; ALIGNED-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:460 +; ALIGNED-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:456 +; ALIGNED-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:452 +; ALIGNED-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:448 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v51 offset:168 +; ALIGNED-NEXT: flat_store_byte v[20:21], v51 offset:169 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v37 offset:164 +; ALIGNED-NEXT: flat_store_byte v[20:21], v37 offset:165 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v30 offset:160 +; ALIGNED-NEXT: flat_store_byte v[20:21], v30 offset:161 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v27 offset:156 +; ALIGNED-NEXT: flat_store_byte v[20:21], v27 offset:157 +; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:400 +; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:404 +; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:408 +; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:412 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:524 -; ALIGNED-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:520 -; ALIGNED-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:516 -; ALIGNED-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:512 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 8, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v51 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v51 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v50 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 8, v50 -; ALIGNED-NEXT: s_waitcnt 
vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v29 offset:62 -; ALIGNED-NEXT: flat_store_byte v[16:17], v29 offset:60 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v23 offset:58 -; ALIGNED-NEXT: flat_store_byte v[16:17], v23 offset:56 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v20 offset:54 -; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:52 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v18 offset:50 -; ALIGNED-NEXT: flat_store_byte v[16:17], v18 offset:48 -; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:528 -; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:532 -; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:536 -; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:540 +; ALIGNED-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:412 +; ALIGNED-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:408 +; ALIGNED-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:404 +; ALIGNED-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:400 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v102 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v102 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v103 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v103 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 8, v63 +; ALIGNED-NEXT: flat_store_byte v[20:21], v112 offset:140 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v63 +; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:138 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v72 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v72 +; ALIGNED-NEXT: flat_store_byte v[20:21], v113 offset:136 +; ALIGNED-NEXT: flat_store_byte v[20:21], v103 offset:134 +; ALIGNED-NEXT: flat_store_byte v[20:21], v112 offset:132 +; ALIGNED-NEXT: flat_store_byte v[20:21], v114 offset:130 +; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:128 +; ALIGNED-NEXT: flat_store_byte v[20:21], 
v115 offset:126 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v38 offset:152 +; ALIGNED-NEXT: flat_store_byte v[20:21], v38 offset:153 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v31 offset:148 +; ALIGNED-NEXT: flat_store_byte v[20:21], v31 offset:149 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v28 offset:144 +; ALIGNED-NEXT: flat_store_byte v[20:21], v28 offset:145 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v26 offset:140 +; ALIGNED-NEXT: flat_store_byte v[20:21], v26 offset:141 +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:496 +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:500 +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:504 +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:508 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:536 -; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:540 -; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:528 -; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:532 +; ALIGNED-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:508 +; ALIGNED-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:504 +; ALIGNED-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:500 +; ALIGNED-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:496 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v101 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v101 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v100 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v100 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 8, v80 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v86 +; 
ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 8, v86 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v80 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v15 offset:42 -; ALIGNED-NEXT: flat_store_byte v[16:17], v15 offset:40 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v19 offset:120 +; ALIGNED-NEXT: flat_store_byte v[20:21], v19 offset:121 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v14 offset:46 -; ALIGNED-NEXT: flat_store_byte v[16:17], v14 offset:44 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v18 offset:116 +; ALIGNED-NEXT: flat_store_byte v[20:21], v18 offset:117 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v13 offset:34 -; ALIGNED-NEXT: flat_store_byte v[16:17], v13 offset:32 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v17 offset:112 +; ALIGNED-NEXT: flat_store_byte v[20:21], v17 offset:113 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v12 offset:38 -; ALIGNED-NEXT: flat_store_byte v[16:17], v12 offset:36 -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:480 -; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:484 -; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:488 -; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:492 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v16 offset:108 +; ALIGNED-NEXT: flat_store_byte v[20:21], v16 offset:109 +; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:512 +; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:516 +; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:520 +; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:524 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:492 -; ALIGNED-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:488 -; ALIGNED-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:484 -; 
ALIGNED-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:480 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v11 offset:30 -; ALIGNED-NEXT: flat_store_byte v[16:17], v11 offset:28 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v10 offset:26 -; ALIGNED-NEXT: flat_store_byte v[16:17], v10 offset:24 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v9 offset:22 -; ALIGNED-NEXT: flat_store_byte v[16:17], v9 offset:20 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v8 offset:18 -; ALIGNED-NEXT: flat_store_byte v[16:17], v8 offset:16 -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:500 -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:504 -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:508 +; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:524 +; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:520 +; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:516 +; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:512 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v15 offset:104 +; ALIGNED-NEXT: flat_store_byte v[20:21], v15 offset:105 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v14 offset:100 +; ALIGNED-NEXT: flat_store_byte v[20:21], v14 offset:101 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v13 offset:96 +; ALIGNED-NEXT: flat_store_byte v[20:21], v13 offset:97 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v12 offset:92 +; ALIGNED-NEXT: flat_store_byte v[20:21], v12 offset:93 +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:464 +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:468 
+; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:472 +; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:476 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:508 -; ALIGNED-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:504 -; ALIGNED-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:500 -; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:496 -; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:247 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v65 -; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:255 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v49 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v49 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:253 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v36 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 8, v36 -; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:251 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v71 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v71 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:249 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v70 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v70 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v65 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:245 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v64 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v64 -; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:215 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v67 +; ALIGNED-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:476 +; ALIGNED-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:472 +; ALIGNED-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:468 +; ALIGNED-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:464 +; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:76 +; ALIGNED-NEXT: flat_store_byte v[20:21], v101 offset:74 +; ALIGNED-NEXT: flat_store_byte v[20:21], v103 offset:72 +; ALIGNED-NEXT: flat_store_byte v[20:21], v100 offset:70 +; ALIGNED-NEXT: 
flat_store_byte v[20:21], v112 offset:68 +; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:66 +; ALIGNED-NEXT: flat_store_byte v[20:21], v113 offset:64 +; ALIGNED-NEXT: flat_store_byte v[20:21], v98 offset:62 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v11 offset:88 +; ALIGNED-NEXT: flat_store_byte v[20:21], v11 offset:89 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v10 offset:84 +; ALIGNED-NEXT: flat_store_byte v[20:21], v10 offset:85 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v9 offset:80 +; ALIGNED-NEXT: flat_store_byte v[20:21], v9 offset:81 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v8 offset:76 +; ALIGNED-NEXT: flat_store_byte v[20:21], v8 offset:77 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:564 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:568 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:572 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:572 +; ALIGNED-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:568 +; ALIGNED-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:564 +; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:560 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v32 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v39 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v39 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v34 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 8, v34 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v33 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v33 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 8, v32 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v67 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v67 -; ALIGNED-NEXT: flat_store_byte v[16:17], v113 offset:243 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v87 -; 
ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v87 -; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:241 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v86 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 8, v86 -; ALIGNED-NEXT: flat_store_byte v[16:17], v114 offset:239 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v85 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v85 -; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:237 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v80 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v80 -; ALIGNED-NEXT: flat_store_byte v[16:17], v115 offset:235 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v101 -; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:233 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v99 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v99 -; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:231 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v96 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v96 -; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:229 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 24, v81 +; ALIGNED-NEXT: flat_store_byte v[20:21], v101 offset:36 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v54 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v54 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v66 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 8, v66 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v53 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v53 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v52 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 8, v52 +; ALIGNED-NEXT: flat_store_byte v[24:25], v114 offset:1 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v82 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v82 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v81 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v81 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:227 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v100 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v100 -; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:225 -; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v36, 24, v97 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v97 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 8, v97 -; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:223 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v82 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 8, v82 -; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:221 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v66 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 8, v66 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:219 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v98 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98 -; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:217 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v83 +; ALIGNED-NEXT: flat_store_byte v[20:21], v98 offset:40 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v87 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v87 +; ALIGNED-NEXT: flat_store_byte v[20:21], v39 offset:38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 24, v83 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v83 -; ALIGNED-NEXT: flat_store_byte v[16:17], v65 offset:213 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 24, v52 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 8, v52 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:211 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v84 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v84 -; ALIGNED-NEXT: flat_store_byte v[16:17], v64 offset:209 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v68 -; ALIGNED-NEXT: flat_store_byte v[16:17], v67 offset:149 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 24, v8 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 8, v8 +; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:44 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v68 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v68 -; ALIGNED-NEXT: flat_store_byte v[16:17], v113 offset:207 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v53 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v53 -; ALIGNED-NEXT: flat_store_byte v[16:17], v87 offset:205 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 24, v37 -; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v37, 8, v37 -; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:203 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v69 +; ALIGNED-NEXT: flat_store_byte v[20:21], v34 offset:42 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v96 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v96 +; ALIGNED-NEXT: flat_store_byte v[20:21], v100 offset:32 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v84 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v84 +; ALIGNED-NEXT: flat_store_byte v[20:21], v33 offset:30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 24, v69 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v69 -; ALIGNED-NEXT: flat_store_byte v[16:17], v86 offset:201 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v54 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v54 -; ALIGNED-NEXT: flat_store_byte v[16:17], v114 offset:199 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v38 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 8, v38 -; ALIGNED-NEXT: flat_store_byte v[16:17], v85 offset:197 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 24, v32 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 8, v32 -; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:195 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v55 +; ALIGNED-NEXT: flat_store_byte v[20:21], v32 offset:34 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v85 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v85 +; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:28 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v70 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v70 +; ALIGNED-NEXT: flat_store_byte v[20:21], v67 offset:26 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 24, v55 +; ALIGNED-NEXT: flat_store_byte v[20:21], v54 offset:222 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 8, v8 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v55 -; ALIGNED-NEXT: flat_store_byte v[16:17], v80 offset:193 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v39 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v39 -; ALIGNED-NEXT: flat_store_byte v[16:17], v115 offset:191 -; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v115, 24, v33 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v33 -; ALIGNED-NEXT: flat_store_byte v[16:17], v101 offset:189 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v26 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 8, v26 -; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:187 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v48 +; ALIGNED-NEXT: flat_store_byte v[20:21], v103 offset:24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v48 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 8, v48 -; ALIGNED-NEXT: flat_store_byte v[16:17], v99 offset:185 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v34 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 8, v34 -; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:183 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v27 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 8, v27 -; ALIGNED-NEXT: flat_store_byte v[16:17], v96 offset:181 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 24, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 8, v21 -; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:179 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 24, v35 +; ALIGNED-NEXT: flat_store_byte v[20:21], v66 offset:22 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v71 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v71 +; ALIGNED-NEXT: flat_store_byte v[20:21], v112 offset:20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v64 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v64 +; ALIGNED-NEXT: flat_store_byte v[20:21], v53 offset:18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v49 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v49 +; ALIGNED-NEXT: flat_store_byte v[20:21], v113 offset:16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v35 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v35 -; ALIGNED-NEXT: flat_store_byte v[16:17], v81 offset:177 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 24, v28 +; ALIGNED-NEXT: flat_store_byte v[20:21], v52 offset:14 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v65 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v65 +; ALIGNED-NEXT: flat_store_byte v[20:21], v115 offset:12 +; 
ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v50 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 8, v50 +; ALIGNED-NEXT: flat_store_byte v[20:21], v86 offset:10 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v36 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 8, v36 +; ALIGNED-NEXT: flat_store_byte v[20:21], v24 offset:8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v29 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v29 +; ALIGNED-NEXT: flat_store_byte v[20:21], v25 offset:6 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v51 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v51 +; ALIGNED-NEXT: flat_store_byte v[20:21], v82 offset:4 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v37 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v37 +; ALIGNED-NEXT: flat_store_byte v[20:21], v81 offset:2 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 24, v30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 8, v30 +; ALIGNED-NEXT: flat_store_byte v[20:21], v80 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v27 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 8, v27 +; ALIGNED-NEXT: flat_store_byte v[20:21], v114 offset:252 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 8, v38 +; ALIGNED-NEXT: flat_store_byte v[20:21], v97 offset:250 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v31 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v31 +; ALIGNED-NEXT: flat_store_byte v[20:21], v98 offset:248 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v28 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 8, v28 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:175 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v22 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 8, v22 -; ALIGNED-NEXT: flat_store_byte v[16:17], v100 offset:173 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v19 +; ALIGNED-NEXT: flat_store_byte v[20:21], v87 offset:246 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 24, v26 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 8, v26 +; ALIGNED-NEXT: flat_store_byte v[20:21], v39 offset:244 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 24, v19 ; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v19, 8, v19 -; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:171 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v29 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v29 -; ALIGNED-NEXT: flat_store_byte v[16:17], v97 offset:169 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v23 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 8, v23 -; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:167 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v20 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 8, v20 -; ALIGNED-NEXT: flat_store_byte v[16:17], v82 offset:165 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18 +; ALIGNED-NEXT: flat_store_byte v[20:21], v83 offset:242 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 24, v18 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 8, v18 -; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:163 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v15 +; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:240 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v17 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v17, 8, v17 +; ALIGNED-NEXT: flat_store_byte v[20:21], v68 offset:238 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v16, 8, v16 +; ALIGNED-NEXT: flat_store_byte v[20:21], v34 offset:236 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v15 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v15, 8, v15 -; ALIGNED-NEXT: flat_store_byte v[16:17], v66 offset:161 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v14 +; ALIGNED-NEXT: flat_store_byte v[20:21], v96 offset:234 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 24, v14 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v14, 8, v14 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:159 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v13 +; ALIGNED-NEXT: flat_store_byte v[20:21], v100 offset:232 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v13 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v13, 8, v13 -; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:157 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v12 +; ALIGNED-NEXT: flat_store_byte v[20:21], v84 offset:230 +; 
ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v12 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v12, 8, v12 -; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:155 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v11 +; ALIGNED-NEXT: flat_store_byte v[20:21], v33 offset:228 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 24, v11 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v11, 8, v11 -; ALIGNED-NEXT: flat_store_byte v[16:17], v83 offset:153 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 24, v10 +; ALIGNED-NEXT: flat_store_byte v[20:21], v69 offset:226 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v10 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v10, 8, v10 -; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:151 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v9 +; ALIGNED-NEXT: flat_store_byte v[20:21], v101 offset:224 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v9 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v9, 8, v9 -; ALIGNED-NEXT: flat_store_byte v[16:17], v65 offset:147 -; ALIGNED-NEXT: flat_store_byte v[16:17], v52 offset:145 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:143 -; ALIGNED-NEXT: flat_store_byte v[16:17], v84 offset:141 -; ALIGNED-NEXT: flat_store_byte v[16:17], v64 offset:139 -; ALIGNED-NEXT: flat_store_byte v[16:17], v68 offset:137 -; ALIGNED-NEXT: flat_store_byte v[16:17], v113 offset:135 -; ALIGNED-NEXT: flat_store_byte v[16:17], v53 offset:133 -; ALIGNED-NEXT: flat_store_byte v[16:17], v87 offset:131 -; ALIGNED-NEXT: flat_store_byte v[16:17], v37 offset:129 -; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:127 -; ALIGNED-NEXT: flat_store_byte v[16:17], v69 offset:125 -; ALIGNED-NEXT: flat_store_byte v[16:17], v86 offset:123 -; ALIGNED-NEXT: flat_store_byte v[16:17], v54 offset:121 -; ALIGNED-NEXT: flat_store_byte v[16:17], v114 offset:119 -; ALIGNED-NEXT: flat_store_byte v[16:17], v38 offset:117 -; ALIGNED-NEXT: flat_store_byte v[16:17], v85 offset:115 -; ALIGNED-NEXT: flat_store_byte v[16:17], v32 offset:113 -; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:111 -; ALIGNED-NEXT: 
flat_store_byte v[16:17], v55 offset:109 -; ALIGNED-NEXT: flat_store_byte v[16:17], v80 offset:107 -; ALIGNED-NEXT: flat_store_byte v[16:17], v39 offset:105 -; ALIGNED-NEXT: flat_store_byte v[16:17], v115 offset:103 -; ALIGNED-NEXT: flat_store_byte v[16:17], v33 offset:101 -; ALIGNED-NEXT: flat_store_byte v[16:17], v101 offset:99 -; ALIGNED-NEXT: flat_store_byte v[16:17], v26 offset:97 -; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:95 -; ALIGNED-NEXT: flat_store_byte v[16:17], v48 offset:93 -; ALIGNED-NEXT: flat_store_byte v[16:17], v99 offset:91 -; ALIGNED-NEXT: flat_store_byte v[16:17], v34 offset:89 -; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:87 -; ALIGNED-NEXT: flat_store_byte v[16:17], v27 offset:85 -; ALIGNED-NEXT: flat_store_byte v[16:17], v96 offset:83 -; ALIGNED-NEXT: flat_store_byte v[16:17], v21 offset:81 -; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:79 -; ALIGNED-NEXT: flat_store_byte v[16:17], v35 offset:77 -; ALIGNED-NEXT: flat_store_byte v[16:17], v81 offset:75 -; ALIGNED-NEXT: flat_store_byte v[16:17], v28 offset:73 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:71 -; ALIGNED-NEXT: flat_store_byte v[16:17], v22 offset:69 -; ALIGNED-NEXT: flat_store_byte v[16:17], v100 offset:67 -; ALIGNED-NEXT: flat_store_byte v[16:17], v19 offset:65 -; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:63 -; ALIGNED-NEXT: flat_store_byte v[16:17], v29 offset:61 -; ALIGNED-NEXT: flat_store_byte v[16:17], v97 offset:59 -; ALIGNED-NEXT: flat_store_byte v[16:17], v23 offset:57 -; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:55 -; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:53 -; ALIGNED-NEXT: flat_store_byte v[16:17], v82 offset:51 -; ALIGNED-NEXT: flat_store_byte v[16:17], v18 offset:49 -; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:43 -; ALIGNED-NEXT: flat_store_byte v[16:17], v15 offset:41 -; ALIGNED-NEXT: flat_store_byte v[16:17], v66 offset:47 -; ALIGNED-NEXT: flat_store_byte v[16:17], v14 offset:45 -; 
ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:35 -; ALIGNED-NEXT: flat_store_byte v[16:17], v13 offset:33 -; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:39 -; ALIGNED-NEXT: flat_store_byte v[16:17], v12 offset:37 -; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:31 -; ALIGNED-NEXT: flat_store_byte v[16:17], v11 offset:29 -; ALIGNED-NEXT: flat_store_byte v[16:17], v83 offset:27 -; ALIGNED-NEXT: flat_store_byte v[16:17], v10 offset:25 -; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:23 -; ALIGNED-NEXT: flat_store_byte v[16:17], v9 offset:21 -; ALIGNED-NEXT: flat_store_byte v[16:17], v67 offset:19 -; ALIGNED-NEXT: flat_store_byte v[16:17], v8 offset:17 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v7 offset:14 -; ALIGNED-NEXT: flat_store_byte v[16:17], v7 offset:12 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v6 offset:10 -; ALIGNED-NEXT: flat_store_byte v[16:17], v6 offset:8 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v5 offset:6 -; ALIGNED-NEXT: flat_store_byte v[16:17], v5 offset:4 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v4 offset:2 -; ALIGNED-NEXT: flat_store_byte v[16:17], v4 +; ALIGNED-NEXT: flat_store_byte v[20:21], v32 offset:220 +; ALIGNED-NEXT: flat_store_byte v[20:21], v85 offset:218 +; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:216 +; ALIGNED-NEXT: flat_store_byte v[20:21], v70 offset:214 +; ALIGNED-NEXT: flat_store_byte v[20:21], v67 offset:212 +; ALIGNED-NEXT: flat_store_byte v[20:21], v55 offset:210 +; ALIGNED-NEXT: flat_store_byte v[20:21], v103 offset:208 +; ALIGNED-NEXT: flat_store_byte v[20:21], v48 offset:206 +; ALIGNED-NEXT: flat_store_byte v[20:21], v66 offset:204 +; ALIGNED-NEXT: flat_store_byte v[20:21], v71 offset:202 +; ALIGNED-NEXT: flat_store_byte v[20:21], v112 offset:200 +; ALIGNED-NEXT: flat_store_byte v[20:21], v64 offset:198 +; 
ALIGNED-NEXT: flat_store_byte v[20:21], v53 offset:196 +; ALIGNED-NEXT: flat_store_byte v[20:21], v49 offset:194 +; ALIGNED-NEXT: flat_store_byte v[20:21], v113 offset:192 +; ALIGNED-NEXT: flat_store_byte v[20:21], v35 offset:190 +; ALIGNED-NEXT: flat_store_byte v[20:21], v52 offset:188 +; ALIGNED-NEXT: flat_store_byte v[20:21], v65 offset:186 +; ALIGNED-NEXT: flat_store_byte v[20:21], v115 offset:184 +; ALIGNED-NEXT: flat_store_byte v[20:21], v50 offset:182 +; ALIGNED-NEXT: flat_store_byte v[20:21], v86 offset:180 +; ALIGNED-NEXT: flat_store_byte v[20:21], v36 offset:178 +; ALIGNED-NEXT: flat_store_byte v[20:21], v24 offset:176 +; ALIGNED-NEXT: flat_store_byte v[20:21], v29 offset:174 +; ALIGNED-NEXT: flat_store_byte v[20:21], v25 offset:172 +; ALIGNED-NEXT: flat_store_byte v[20:21], v51 offset:170 +; ALIGNED-NEXT: flat_store_byte v[20:21], v82 offset:168 +; ALIGNED-NEXT: flat_store_byte v[20:21], v37 offset:166 +; ALIGNED-NEXT: flat_store_byte v[20:21], v81 offset:164 +; ALIGNED-NEXT: flat_store_byte v[20:21], v30 offset:162 +; ALIGNED-NEXT: flat_store_byte v[20:21], v80 offset:160 +; ALIGNED-NEXT: flat_store_byte v[20:21], v27 offset:158 +; ALIGNED-NEXT: flat_store_byte v[20:21], v114 offset:156 +; ALIGNED-NEXT: flat_store_byte v[20:21], v38 offset:154 +; ALIGNED-NEXT: flat_store_byte v[20:21], v97 offset:152 +; ALIGNED-NEXT: flat_store_byte v[20:21], v31 offset:150 +; ALIGNED-NEXT: flat_store_byte v[20:21], v98 offset:148 +; ALIGNED-NEXT: flat_store_byte v[20:21], v28 offset:146 +; ALIGNED-NEXT: flat_store_byte v[20:21], v87 offset:144 +; ALIGNED-NEXT: flat_store_byte v[20:21], v26 offset:142 +; ALIGNED-NEXT: flat_store_byte v[20:21], v39 offset:124 +; ALIGNED-NEXT: flat_store_byte v[20:21], v19 offset:122 +; ALIGNED-NEXT: flat_store_byte v[20:21], v83 offset:120 +; ALIGNED-NEXT: flat_store_byte v[20:21], v18 offset:118 +; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:116 +; ALIGNED-NEXT: flat_store_byte v[20:21], v17 offset:114 +; ALIGNED-NEXT: 
flat_store_byte v[20:21], v68 offset:112 +; ALIGNED-NEXT: flat_store_byte v[20:21], v16 offset:110 +; ALIGNED-NEXT: flat_store_byte v[20:21], v34 offset:108 +; ALIGNED-NEXT: flat_store_byte v[20:21], v15 offset:106 +; ALIGNED-NEXT: flat_store_byte v[20:21], v96 offset:104 +; ALIGNED-NEXT: flat_store_byte v[20:21], v14 offset:102 +; ALIGNED-NEXT: flat_store_byte v[20:21], v100 offset:100 +; ALIGNED-NEXT: flat_store_byte v[20:21], v13 offset:98 +; ALIGNED-NEXT: flat_store_byte v[20:21], v84 offset:96 +; ALIGNED-NEXT: flat_store_byte v[20:21], v12 offset:94 +; ALIGNED-NEXT: flat_store_byte v[20:21], v33 offset:92 +; ALIGNED-NEXT: flat_store_byte v[20:21], v11 offset:90 +; ALIGNED-NEXT: flat_store_byte v[20:21], v69 offset:88 +; ALIGNED-NEXT: flat_store_byte v[20:21], v10 offset:86 +; ALIGNED-NEXT: flat_store_byte v[20:21], v101 offset:84 +; ALIGNED-NEXT: flat_store_byte v[20:21], v9 offset:82 +; ALIGNED-NEXT: flat_store_byte v[20:21], v54 offset:80 +; ALIGNED-NEXT: flat_store_byte v[20:21], v8 offset:78 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v7 offset:56 +; ALIGNED-NEXT: flat_store_byte v[20:21], v7 offset:57 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v6 offset:52 +; ALIGNED-NEXT: flat_store_byte v[20:21], v6 offset:53 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v5 offset:48 +; ALIGNED-NEXT: flat_store_byte v[20:21], v5 offset:49 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v4 offset:44 +; ALIGNED-NEXT: flat_store_byte v[20:21], v4 offset:45 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 24, v7 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v7, 8, v7 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v9, 24, v6 @@ -6786,26 +6931,35 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v5, 8, v5 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v11, 24, v4 ; ALIGNED-NEXT: v_lshrrev_b32_e32 
v4, 8, v4 -; ALIGNED-NEXT: flat_store_byte v[16:17], v8 offset:15 -; ALIGNED-NEXT: flat_store_byte v[16:17], v7 offset:13 -; ALIGNED-NEXT: flat_store_byte v[16:17], v9 offset:11 -; ALIGNED-NEXT: flat_store_byte v[16:17], v6 offset:9 -; ALIGNED-NEXT: flat_store_byte v[16:17], v10 offset:7 -; ALIGNED-NEXT: flat_store_byte v[16:17], v5 offset:5 -; ALIGNED-NEXT: flat_store_byte v[16:17], v11 offset:3 -; ALIGNED-NEXT: flat_store_byte v[16:17], v4 offset:1 +; ALIGNED-NEXT: flat_store_byte v[20:21], v8 offset:60 +; ALIGNED-NEXT: flat_store_byte v[20:21], v7 offset:58 +; ALIGNED-NEXT: flat_store_byte v[20:21], v9 offset:56 +; ALIGNED-NEXT: flat_store_byte v[20:21], v6 offset:54 +; ALIGNED-NEXT: flat_store_byte v[20:21], v10 offset:52 +; ALIGNED-NEXT: flat_store_byte v[20:21], v5 offset:50 +; ALIGNED-NEXT: flat_store_byte v[20:21], v11 offset:48 +; ALIGNED-NEXT: flat_store_byte v[20:21], v4 offset:46 ; ALIGNED-NEXT: s_cbranch_scc0 .LBB5_5 ; ALIGNED-NEXT: .LBB5_6: ; %Flow6 ; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32 -; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 -; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 -; ALIGNED-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 -; ALIGNED-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 -; ALIGNED-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 -; ALIGNED-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 -; ALIGNED-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 +; ALIGNED-NEXT: s_clause 0x10 +; ALIGNED-NEXT: buffer_load_dword v72, off, s[0:3], s32 +; ALIGNED-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 +; ALIGNED-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 +; ALIGNED-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 +; ALIGNED-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:16 +; ALIGNED-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 
+; ALIGNED-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 +; ALIGNED-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 +; ALIGNED-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:32 +; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 +; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40 +; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 +; ALIGNED-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 +; ALIGNED-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 +; ALIGNED-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 +; ALIGNED-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 +; ALIGNED-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; ALIGNED-NEXT: s_setpc_b64 s[30:31] ; @@ -8422,34 +8576,36 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; CHECK-NEXT: global_load_dwordx4 v[80:83], v[96:97], off offset:32 ; CHECK-NEXT: global_load_dwordx4 v[84:87], v[96:97], off offset:16 ; CHECK-NEXT: global_load_dwordx4 v[96:99], v[96:97], off +; CHECK-NEXT: v_add_co_u32 v102, vcc_lo, v100, 48 ; CHECK-NEXT: s_add_u32 s4, s4, 0x100 +; CHECK-NEXT: v_add_co_ci_u32_e64 v103, null, 0, v101, vcc_lo ; CHECK-NEXT: s_addc_u32 s5, s5, 0 ; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[4:7] offset:240 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[4:7] offset:192 ; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[8:11] offset:224 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[8:11] offset:176 ; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[12:15] offset:208 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[12:15] offset:160 ; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[16:19] offset:192 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[16:19] offset:144 ; CHECK-NEXT: s_waitcnt vmcnt(11) 
-; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[20:23] offset:176 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[20:23] offset:128 ; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[24:27] offset:160 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[24:27] offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[28:31] offset:144 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[28:31] offset:96 ; CHECK-NEXT: s_waitcnt vmcnt(8) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[32:35] offset:128 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[36:39] offset:112 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[36:39] offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:96 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[48:51] offset:48 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[52:55] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(4) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[68:71] ; CHECK-NEXT: s_waitcnt vmcnt(2) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(1) @@ -8488,34 +8644,36 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; CHECK-NEXT: global_load_dwordx4 v[80:83], v[96:97], off offset:32 ; CHECK-NEXT: global_load_dwordx4 v[84:87], v[96:97], off offset:16 ; CHECK-NEXT: global_load_dwordx4 v[96:99], v[96:97], off +; CHECK-NEXT: v_add_co_u32 v102, vcc_lo, v100, 48 ; CHECK-NEXT: s_add_u32 s4, s4, 0xffffff00 +; CHECK-NEXT: v_add_co_ci_u32_e64 v103, null, 0, v101, vcc_lo ; CHECK-NEXT: s_addc_u32 s5, s5, -1 ; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 
v[100:101], v[4:7] offset:240 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[4:7] offset:192 ; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[8:11] offset:224 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[8:11] offset:176 ; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[12:15] offset:208 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[12:15] offset:160 ; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[16:19] offset:192 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[16:19] offset:144 ; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[20:23] offset:176 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[20:23] offset:128 ; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[24:27] offset:160 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[24:27] offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[28:31] offset:144 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[28:31] offset:96 ; CHECK-NEXT: s_waitcnt vmcnt(8) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[32:35] offset:128 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[36:39] offset:112 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[36:39] offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:96 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[48:51] offset:48 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[52:55] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(4) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[68:71] ; CHECK-NEXT: s_waitcnt vmcnt(2) ; CHECK-NEXT: flat_store_dwordx4 
v[100:101], v[80:83] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(1) @@ -8542,11 +8700,11 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 ; ALIGNED-NEXT: v_add_co_u32 v4, vcc_lo, v2, s4 ; ALIGNED-NEXT: v_add_co_ci_u32_e64 v5, null, s5, v3, vcc_lo -; ALIGNED-NEXT: v_add_co_u32 v96, vcc_lo, v0, s4 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v97, null, s5, v1, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v84, vcc_lo, v0, s4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v85, null, s5, v1, vcc_lo ; ALIGNED-NEXT: s_clause 0xf -; ALIGNED-NEXT: global_load_dwordx4 v[112:115], v[4:5], off offset:240 -; ALIGNED-NEXT: global_load_dwordx4 v[84:87], v[4:5], off offset:224 +; ALIGNED-NEXT: global_load_dwordx4 v[98:101], v[4:5], off offset:240 +; ALIGNED-NEXT: global_load_dwordx4 v[112:115], v[4:5], off offset:224 ; ALIGNED-NEXT: global_load_dwordx4 v[80:83], v[4:5], off offset:208 ; ALIGNED-NEXT: global_load_dwordx4 v[68:71], v[4:5], off offset:192 ; ALIGNED-NEXT: global_load_dwordx4 v[64:67], v[4:5], off offset:176 @@ -8561,473 +8719,477 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; ALIGNED-NEXT: global_load_dwordx4 v[12:15], v[4:5], off offset:32 ; ALIGNED-NEXT: global_load_dwordx4 v[8:11], v[4:5], off offset:16 ; ALIGNED-NEXT: global_load_dwordx4 v[4:7], v[4:5], off +; ALIGNED-NEXT: v_add_co_u32 v86, vcc_lo, v84, 6 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v87, null, 0, v85, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v96, vcc_lo, v84, 3 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v97, null, 0, v85, vcc_lo ; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 ; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 ; ALIGNED-NEXT: s_waitcnt vmcnt(15) -; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:168 -; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:172 -; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:164 -; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:160 +; 
ALIGNED-NEXT: buffer_store_dword v100, off, s[0:3], s32 offset:168 +; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:172 +; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:164 +; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:160 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v100 offset:244 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v100 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v100 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v101 offset:248 +; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:249 +; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:245 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v99 offset:240 +; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:241 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v98 offset:236 +; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:237 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v101 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v101 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v116, 24, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v117, 8, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v118, 24, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v119, 8, v98 +; ALIGNED-NEXT: s_waitcnt vmcnt(14) ; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v114 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v114 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v114 offset:250 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v115 offset:254 +; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:248 +; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:246 +; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:252 +; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:250 +; ALIGNED-NEXT: flat_store_byte v[96:97], v116 offset:244 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v115 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:252 +; ALIGNED-NEXT: flat_store_byte v[96:97], v117 offset:242 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v115 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:248 +; 
ALIGNED-NEXT: flat_store_byte v[96:97], v118 offset:240 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v113 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v113 offset:246 +; ALIGNED-NEXT: flat_store_byte v[96:97], v119 offset:238 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v113 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:244 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v112 offset:242 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:240 +; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:184 +; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:188 +; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:180 +; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:176 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v114 offset:228 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v115 offset:232 +; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:233 +; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:229 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v113 offset:224 +; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:225 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v112 offset:220 +; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:221 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v112 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v112 -; ALIGNED-NEXT: s_waitcnt vmcnt(14) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v86 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v86 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:251 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v87 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:249 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v87 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:255 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v85 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:253 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v85 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:247 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 
24, v84 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:245 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v84 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:243 ; ALIGNED-NEXT: s_waitcnt vmcnt(13) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v82 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:241 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v82 -; ALIGNED-NEXT: buffer_store_dword v86, off, s[0:3], s32 offset:184 -; ALIGNED-NEXT: buffer_store_dword v87, off, s[0:3], s32 offset:188 -; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:180 -; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:176 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v86 offset:234 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v87 offset:238 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:236 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:232 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v85 offset:230 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:228 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v84 offset:226 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:224 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v83 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v83 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v81 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v81 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:235 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v80 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:233 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v80 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:239 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v82 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v82 +; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:232 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v83 +; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:230 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v83 +; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:236 +; ALIGNED-NEXT: v_lshrrev_b32_e32 
v100, 24, v81 +; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:234 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v81 +; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:228 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v80 +; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:226 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v80 +; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:224 ; ALIGNED-NEXT: s_waitcnt vmcnt(12) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v70 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:237 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v70 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:231 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v71 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:229 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v71 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:227 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v69 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:225 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v69 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v70 +; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:222 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v70 ; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:136 ; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:140 ; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:132 ; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:128 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v82 offset:218 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v83 offset:222 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:220 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:216 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v81 offset:214 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:212 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v80 offset:210 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:208 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v68 -; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v81, 8, v68 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v82 offset:212 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v83 offset:216 +; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:217 +; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:213 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v81 offset:208 +; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:209 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v80 offset:204 +; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:205 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v71 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v71 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v69 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v69 +; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:216 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v68 +; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:214 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v68 +; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:220 ; ALIGNED-NEXT: s_waitcnt vmcnt(11) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v66 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v66 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:219 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v67 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:217 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v67 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:223 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v65 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:221 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v65 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:215 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v64 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:213 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v64 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:211 -; ALIGNED-NEXT: s_waitcnt vmcnt(10) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v54 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:209 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 
8, v54 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v66 +; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:218 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v66 +; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:212 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v67 +; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:210 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v67 +; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:208 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v65 +; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:206 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v65 ; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:152 ; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:156 ; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:148 ; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:144 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v70 offset:202 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v71 offset:206 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:204 -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:200 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v69 offset:198 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:196 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v68 offset:194 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:192 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v52 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v52 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:203 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v53 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v55 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v55 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:201 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v53 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:207 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v70 offset:196 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v71 offset:200 +; ALIGNED-NEXT: flat_store_byte v[96:97], v71 
offset:201 +; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:197 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v69 offset:192 +; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:193 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v68 offset:188 +; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:189 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v64 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v64 +; ALIGNED-NEXT: s_waitcnt vmcnt(10) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v54 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v54 +; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:200 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v55 +; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:198 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v55 +; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:204 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v52 +; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:202 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v52 +; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:196 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v53 +; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:194 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v53 +; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:192 ; ALIGNED-NEXT: s_waitcnt vmcnt(9) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v50 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:205 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v50 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:199 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v51 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:197 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v51 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:195 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v49 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:193 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v49 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v50 +; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:190 +; ALIGNED-NEXT: v_lshrrev_b32_e32 
v115, 8, v50 ; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:232 ; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:236 ; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:228 ; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:224 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v66 offset:186 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v67 offset:190 -; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:188 -; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:184 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v65 offset:182 -; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:180 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v64 offset:178 -; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:176 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v48 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v48 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v66 offset:180 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v67 offset:184 +; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:185 +; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:181 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v65 offset:176 +; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:177 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v64 offset:172 +; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:173 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v51 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v51 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v49 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v49 +; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:184 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v48 +; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:182 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v48 +; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:188 ; ALIGNED-NEXT: s_waitcnt vmcnt(8) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v38 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v38 -; ALIGNED-NEXT: flat_store_byte v[96:97], 
v82 offset:187 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v39 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:185 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v39 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:191 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v37 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:189 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v37 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:183 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v36 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:181 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v36 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:179 -; ALIGNED-NEXT: s_waitcnt vmcnt(7) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v34 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:177 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v34 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v38 +; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:186 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v38 +; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:180 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v39 +; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:178 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v39 +; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:176 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v37 +; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:174 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v37 ; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:252 ; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:248 ; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:244 ; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:240 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v54 offset:170 -; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:168 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v55 offset:174 -; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:172 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v52 
offset:162 -; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:160 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v53 offset:166 -; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:164 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v35 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v35 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v33 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v33 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:171 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v32 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:169 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v32 -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:163 -; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v31 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:161 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v31 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:167 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v29 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:175 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v30 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:173 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v30 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:165 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v29 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v54 offset:164 +; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:165 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v55 offset:168 +; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:169 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v52 offset:156 +; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:157 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v53 offset:160 +; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:161 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v36 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v36 +; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v34 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v34 +; 
ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:168 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v35 +; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:166 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v35 +; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:172 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v33 +; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:170 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v33 +; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:160 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v32 +; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:158 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v32 +; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:164 +; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:162 ; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:200 ; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:204 ; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:196 ; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:192 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v50 offset:154 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v51 offset:158 -; ALIGNED-NEXT: flat_store_byte v[96:97], v51 offset:156 -; ALIGNED-NEXT: flat_store_byte v[96:97], v50 offset:152 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v49 offset:150 -; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:148 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v48 offset:146 -; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:144 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v28 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v28 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v50 offset:148 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v51 offset:152 +; ALIGNED-NEXT: flat_store_byte v[96:97], v51 offset:153 +; ALIGNED-NEXT: flat_store_byte v[96:97], v50 offset:149 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v49 offset:144 +; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:145 +; ALIGNED-NEXT: 
flat_store_byte_d16_hi v[86:87], v48 offset:140 +; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:141 +; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:152 +; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v28 +; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:150 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v28 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v31 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v31 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v29 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v29 +; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:156 ; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v26 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v26 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:155 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:153 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:159 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:157 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:151 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:149 -; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:147 -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v22 -; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:145 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v26 +; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:154 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v26 +; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:148 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v27 +; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:146 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v27 +; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:144 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v25 +; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:142 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v25 ; ALIGNED-NEXT: buffer_store_dword v38, 
off, s[0:3], s32 offset:216 ; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:220 ; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:212 ; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:208 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v38 offset:138 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v39 offset:142 -; ALIGNED-NEXT: flat_store_byte v[96:97], v39 offset:140 -; ALIGNED-NEXT: flat_store_byte v[96:97], v38 offset:136 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v37 offset:134 -; ALIGNED-NEXT: flat_store_byte v[96:97], v37 offset:132 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v36 offset:130 -; ALIGNED-NEXT: flat_store_byte v[96:97], v36 offset:128 -; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:139 -; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:137 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:143 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:141 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:135 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:133 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:131 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:129 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v38 offset:132 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v39 offset:136 +; ALIGNED-NEXT: flat_store_byte v[96:97], v39 offset:137 +; ALIGNED-NEXT: flat_store_byte v[96:97], v38 offset:133 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v37 offset:128 +; ALIGNED-NEXT: flat_store_byte v[96:97], v37 offset:129 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v36 offset:124 +; ALIGNED-NEXT: flat_store_byte v[84:85], v36 offset:128 +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 24, v22 +; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:136 +; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:134 +; ALIGNED-NEXT: 
flat_store_byte v[96:97], v102 offset:140 +; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:138 +; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:132 +; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:130 +; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:128 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v18 +; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:126 ; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:40 ; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:44 ; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:36 ; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:32 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v34 offset:122 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v35 offset:126 -; ALIGNED-NEXT: flat_store_byte v[96:97], v35 offset:124 -; ALIGNED-NEXT: flat_store_byte v[96:97], v34 offset:120 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v33 offset:118 -; ALIGNED-NEXT: flat_store_byte v[96:97], v33 offset:116 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v32 offset:114 -; ALIGNED-NEXT: flat_store_byte v[96:97], v32 offset:112 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v14 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:123 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:121 -; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:127 -; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:125 -; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:119 -; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:117 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:115 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v10 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:113 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v34 offset:116 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v35 offset:120 +; ALIGNED-NEXT: flat_store_byte v[96:97], v35 offset:121 
+; ALIGNED-NEXT: flat_store_byte v[96:97], v34 offset:117 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v33 offset:112 +; ALIGNED-NEXT: flat_store_byte v[96:97], v33 offset:113 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v32 offset:108 +; ALIGNED-NEXT: flat_store_byte v[96:97], v32 offset:109 +; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:120 +; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:118 +; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:124 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v14 +; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:122 +; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:116 +; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:114 +; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:112 +; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:110 ; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:56 ; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:60 ; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:52 ; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:48 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110 -; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108 -; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102 -; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98 -; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96 -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:111 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v6 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v27 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v22 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v18 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v14 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v10 -; 
ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:109 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v6 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v30 offset:100 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v31 offset:104 +; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:105 +; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:101 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v29 offset:96 +; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:97 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v28 offset:92 +; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:93 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v10 +; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:96 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v6 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v22 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v14 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v10 +; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:94 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v6 ; ALIGNED-NEXT: s_cmp_lg_u64 s[4:5], 0x800 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v27 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v23 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v19 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v15 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v11 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:103 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v7 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v23 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 24, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v20 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v20 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v19 -; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v84, 24, v17 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v17 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v16 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v16 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v15 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v13 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v13 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v12 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v12 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v11 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v9 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v9 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:107 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v8 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:105 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v8 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v7 -; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:99 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v5 -; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:97 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v5 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v15 +; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:104 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v11 +; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:102 +; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:108 +; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:106 +; ALIGNED-NEXT: flat_store_byte v[96:97], v50 offset:100 +; ALIGNED-NEXT: flat_store_byte v[96:97], v51 offset:98 ; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:8 ; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:12 ; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:4 ; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 -; ALIGNED-NEXT: flat_store_byte v[96:97], v50 offset:91 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v4 -; ALIGNED-NEXT: 
flat_store_byte v[96:97], v51 offset:89 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v4 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v26 offset:90 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v27 offset:94 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:95 -; ALIGNED-NEXT: flat_store_byte v[96:97], v27 offset:92 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:93 -; ALIGNED-NEXT: flat_store_byte v[96:97], v26 offset:88 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:87 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v25 offset:86 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:85 -; ALIGNED-NEXT: flat_store_byte v[96:97], v25 offset:84 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:83 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v24 offset:82 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:81 -; ALIGNED-NEXT: flat_store_byte v[96:97], v24 offset:80 +; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:88 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v7 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v17 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v17 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v15 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v13 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v13 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v11 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v9 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v9 +; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v50, 24, v8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v8 +; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:86 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v7 +; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:92 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v5 +; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:90 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v5 +; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:84 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v4 +; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:82 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v4 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v26 offset:84 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v27 offset:88 +; ALIGNED-NEXT: flat_store_byte v[96:97], v27 offset:89 +; ALIGNED-NEXT: flat_store_byte v[96:97], v26 offset:85 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v25 offset:80 +; ALIGNED-NEXT: flat_store_byte v[96:97], v25 offset:81 +; ALIGNED-NEXT: flat_store_byte v[96:97], v36 offset:80 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v24 offset:76 +; ALIGNED-NEXT: flat_store_byte v[96:97], v37 offset:78 +; ALIGNED-NEXT: flat_store_byte v[96:97], v24 offset:77 ; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:24 ; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:28 ; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:20 ; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:16 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v22 offset:74 -; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:75 -; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:73 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v23 offset:78 -; ALIGNED-NEXT: flat_store_byte v[96:97], v36 offset:79 -; ALIGNED-NEXT: flat_store_byte v[96:97], v23 offset:76 -; ALIGNED-NEXT: flat_store_byte v[96:97], v37 offset:77 -; ALIGNED-NEXT: flat_store_byte v[96:97], v22 offset:72 -; ALIGNED-NEXT: flat_store_byte v[96:97], v38 offset:71 -; 
ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v21 offset:70 -; ALIGNED-NEXT: flat_store_byte v[96:97], v39 offset:69 -; ALIGNED-NEXT: flat_store_byte v[96:97], v21 offset:68 -; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:67 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v20 offset:66 -; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:65 -; ALIGNED-NEXT: flat_store_byte v[96:97], v20 offset:64 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v22 offset:68 +; ALIGNED-NEXT: flat_store_byte v[96:97], v38 offset:72 +; ALIGNED-NEXT: flat_store_byte v[96:97], v39 offset:70 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v23 offset:72 +; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:76 +; ALIGNED-NEXT: flat_store_byte v[96:97], v23 offset:73 +; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:74 +; ALIGNED-NEXT: flat_store_byte v[96:97], v22 offset:69 +; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:68 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v21 offset:64 +; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:66 +; ALIGNED-NEXT: flat_store_byte v[96:97], v21 offset:65 +; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:64 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v20 offset:60 +; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:62 +; ALIGNED-NEXT: flat_store_byte v[84:85], v20 offset:64 ; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v18 offset:58 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:59 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:57 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v19 offset:62 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:63 -; ALIGNED-NEXT: flat_store_byte v[96:97], v19 offset:60 -; ALIGNED-NEXT: 
flat_store_byte v[96:97], v112 offset:61 -; ALIGNED-NEXT: flat_store_byte v[96:97], v18 offset:56 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:55 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v17 offset:54 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:53 -; ALIGNED-NEXT: flat_store_byte v[96:97], v17 offset:52 -; ALIGNED-NEXT: flat_store_byte v[96:97], v32 offset:51 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v16 offset:50 -; ALIGNED-NEXT: flat_store_byte v[96:97], v33 offset:49 -; ALIGNED-NEXT: flat_store_byte v[96:97], v16 offset:48 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v18 offset:52 +; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:56 +; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:54 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v19 offset:56 +; ALIGNED-NEXT: flat_store_byte v[96:97], v32 offset:60 +; ALIGNED-NEXT: flat_store_byte v[96:97], v19 offset:57 +; ALIGNED-NEXT: flat_store_byte v[96:97], v33 offset:58 +; ALIGNED-NEXT: flat_store_byte v[96:97], v18 offset:53 +; ALIGNED-NEXT: flat_store_byte v[96:97], v34 offset:52 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v17 offset:48 +; ALIGNED-NEXT: flat_store_byte v[96:97], v35 offset:50 +; ALIGNED-NEXT: flat_store_byte v[96:97], v17 offset:49 +; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:48 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v16 offset:44 +; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:46 +; ALIGNED-NEXT: flat_store_byte v[96:97], v16 offset:45 ; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:124 ; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:116 ; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:112 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v14 offset:42 -; ALIGNED-NEXT: flat_store_byte v[96:97], v34 offset:43 -; ALIGNED-NEXT: flat_store_byte v[96:97], v35 offset:41 -; ALIGNED-NEXT: flat_store_byte_d16_hi 
v[96:97], v15 offset:46 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:47 -; ALIGNED-NEXT: flat_store_byte v[96:97], v15 offset:44 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:45 -; ALIGNED-NEXT: flat_store_byte v[96:97], v14 offset:40 -; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:39 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v13 offset:38 -; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:37 -; ALIGNED-NEXT: flat_store_byte v[96:97], v13 offset:36 -; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:35 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v12 offset:34 -; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:33 -; ALIGNED-NEXT: flat_store_byte v[96:97], v12 offset:32 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v14 offset:36 +; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:40 +; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:38 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v15 offset:40 +; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:44 +; ALIGNED-NEXT: flat_store_byte v[96:97], v15 offset:41 +; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:42 +; ALIGNED-NEXT: flat_store_byte v[96:97], v14 offset:37 +; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:36 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v13 offset:32 +; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:34 +; ALIGNED-NEXT: flat_store_byte v[96:97], v13 offset:33 +; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:32 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v12 offset:28 +; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:30 +; ALIGNED-NEXT: flat_store_byte v[84:85], v12 offset:32 ; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:72 ; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:76 ; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:68 ; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:64 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v10 offset:26 -; 
ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:27 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:25 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v11 offset:30 -; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:31 -; ALIGNED-NEXT: flat_store_byte v[96:97], v11 offset:28 -; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:29 -; ALIGNED-NEXT: flat_store_byte v[96:97], v10 offset:24 -; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:23 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v9 offset:22 -; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:21 -; ALIGNED-NEXT: flat_store_byte v[96:97], v9 offset:20 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:19 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v8 offset:18 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:17 -; ALIGNED-NEXT: flat_store_byte v[96:97], v8 offset:16 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v10 offset:20 +; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:24 +; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:22 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v11 offset:24 +; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:28 +; ALIGNED-NEXT: flat_store_byte v[96:97], v11 offset:25 +; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:26 +; ALIGNED-NEXT: flat_store_byte v[96:97], v10 offset:21 +; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:20 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v9 offset:16 +; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:18 +; ALIGNED-NEXT: flat_store_byte v[96:97], v9 offset:17 +; ALIGNED-NEXT: flat_store_byte v[96:97], v50 offset:16 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v8 offset:12 +; ALIGNED-NEXT: flat_store_byte v[96:97], v51 offset:14 +; ALIGNED-NEXT: flat_store_byte v[84:85], v8 offset:16 ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:88 ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:92 ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 
offset:84 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v6 offset:10 -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:11 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:9 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v7 offset:14 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:15 -; ALIGNED-NEXT: flat_store_byte v[96:97], v7 offset:12 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:13 -; ALIGNED-NEXT: flat_store_byte v[96:97], v6 offset:8 -; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:7 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v5 offset:6 -; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:5 -; ALIGNED-NEXT: flat_store_byte v[96:97], v5 offset:4 -; ALIGNED-NEXT: flat_store_byte v[96:97], v50 offset:3 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v4 offset:2 -; ALIGNED-NEXT: flat_store_byte v[96:97], v51 offset:1 -; ALIGNED-NEXT: flat_store_byte v[96:97], v4 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v6 offset:4 +; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:8 +; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:6 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v7 offset:8 +; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:12 +; ALIGNED-NEXT: flat_store_byte v[96:97], v7 offset:9 +; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:10 +; ALIGNED-NEXT: flat_store_byte v[84:85], v6 offset:8 +; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:4 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v5 +; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:2 +; ALIGNED-NEXT: flat_store_byte v[84:85], v5 offset:4 +; ALIGNED-NEXT: flat_store_byte v[96:97], v98 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v4 offset:2 +; ALIGNED-NEXT: flat_store_byte v[84:85], v99 offset:1 +; ALIGNED-NEXT: flat_store_byte v[84:85], v4 ; ALIGNED-NEXT: s_cbranch_scc1 .LBB7_2 ; ALIGNED-NEXT: .LBB7_3: ; %Flow6 ; ALIGNED-NEXT: s_andn2_saveexec_b32 s8, s6 
@@ -9040,11 +9202,11 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 ; ALIGNED-NEXT: v_add_co_u32 v4, vcc_lo, v2, s4 ; ALIGNED-NEXT: v_add_co_ci_u32_e64 v5, null, s5, v3, vcc_lo -; ALIGNED-NEXT: v_add_co_u32 v96, vcc_lo, v0, s4 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v97, null, s5, v1, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v84, vcc_lo, v0, s4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v85, null, s5, v1, vcc_lo ; ALIGNED-NEXT: s_clause 0xf ; ALIGNED-NEXT: global_load_dwordx4 v[98:101], v[4:5], off offset:240 -; ALIGNED-NEXT: global_load_dwordx4 v[84:87], v[4:5], off offset:224 +; ALIGNED-NEXT: global_load_dwordx4 v[112:115], v[4:5], off offset:224 ; ALIGNED-NEXT: global_load_dwordx4 v[80:83], v[4:5], off offset:208 ; ALIGNED-NEXT: global_load_dwordx4 v[68:71], v[4:5], off offset:192 ; ALIGNED-NEXT: global_load_dwordx4 v[64:67], v[4:5], off offset:176 @@ -9059,6 +9221,10 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; ALIGNED-NEXT: global_load_dwordx4 v[12:15], v[4:5], off offset:32 ; ALIGNED-NEXT: global_load_dwordx4 v[8:11], v[4:5], off offset:16 ; ALIGNED-NEXT: global_load_dwordx4 v[4:7], v[4:5], off +; ALIGNED-NEXT: v_add_co_u32 v96, vcc_lo, v84, 6 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v97, null, 0, v85, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v86, vcc_lo, v84, 3 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v87, null, 0, v85, vcc_lo ; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00 ; ALIGNED-NEXT: s_addc_u32 s5, s5, -1 ; ALIGNED-NEXT: s_waitcnt vmcnt(15) @@ -9066,465 +9232,465 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:428 ; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:420 ; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:416 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v100 offset:250 -; ALIGNED-NEXT: flat_store_byte_d16_hi 
v[96:97], v101 offset:254 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:252 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:248 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v99 offset:246 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:244 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v98 offset:242 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:240 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v100 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v100 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v100 offset:244 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v101 offset:248 +; ALIGNED-NEXT: flat_store_byte v[86:87], v101 offset:249 +; ALIGNED-NEXT: flat_store_byte v[86:87], v100 offset:245 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v99 offset:240 +; ALIGNED-NEXT: flat_store_byte v[86:87], v99 offset:241 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v98 offset:236 +; ALIGNED-NEXT: flat_store_byte v[86:87], v98 offset:237 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v100 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v99 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v99 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v98 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v101 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v100 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v116, 24, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v117, 8, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v118, 24, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v119, 8, v98 +; ALIGNED-NEXT: flat_store_byte v[86:87], v102 offset:248 +; ALIGNED-NEXT: flat_store_byte v[86:87], v100 offset:246 +; ALIGNED-NEXT: flat_store_byte v[86:87], v103 offset:252 ; ALIGNED-NEXT: s_waitcnt vmcnt(14) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v86 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v86 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:251 -; 
ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v87 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:249 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v87 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:255 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v85 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:253 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v85 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:247 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v84 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:245 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v84 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:243 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v114 +; ALIGNED-NEXT: flat_store_byte v[86:87], v101 offset:250 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v114 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v115 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v115 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v113 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v113 +; ALIGNED-NEXT: flat_store_byte v[86:87], v116 offset:244 +; ALIGNED-NEXT: flat_store_byte v[86:87], v117 offset:242 +; ALIGNED-NEXT: flat_store_byte v[86:87], v118 offset:240 +; ALIGNED-NEXT: flat_store_byte v[86:87], v119 offset:238 +; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:440 +; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:444 +; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:436 +; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:432 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v114 offset:228 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v115 offset:232 +; ALIGNED-NEXT: flat_store_byte v[86:87], v115 offset:233 +; ALIGNED-NEXT: flat_store_byte v[86:87], v114 offset:229 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v113 offset:224 +; ALIGNED-NEXT: flat_store_byte v[86:87], v113 offset:225 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v112 offset:220 +; ALIGNED-NEXT: flat_store_byte v[86:87], v112 
offset:221 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v112 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v112 ; ALIGNED-NEXT: s_waitcnt vmcnt(13) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v82 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:241 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v82 -; ALIGNED-NEXT: buffer_store_dword v86, off, s[0:3], s32 offset:440 -; ALIGNED-NEXT: buffer_store_dword v87, off, s[0:3], s32 offset:444 -; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:436 -; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:432 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v86 offset:234 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v87 offset:238 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:236 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:232 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v85 offset:230 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:228 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v84 offset:226 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:224 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v83 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v83 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v81 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v81 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:235 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v80 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:233 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v80 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:239 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v82 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v82 +; ALIGNED-NEXT: flat_store_byte v[86:87], v98 offset:232 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v83 +; ALIGNED-NEXT: flat_store_byte v[86:87], v99 offset:230 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v83 +; ALIGNED-NEXT: flat_store_byte v[86:87], v100 offset:236 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v81 +; ALIGNED-NEXT: flat_store_byte v[86:87], v101 
offset:234 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v81 +; ALIGNED-NEXT: flat_store_byte v[86:87], v102 offset:228 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v80 +; ALIGNED-NEXT: flat_store_byte v[86:87], v103 offset:226 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v80 +; ALIGNED-NEXT: flat_store_byte v[86:87], v113 offset:224 ; ALIGNED-NEXT: s_waitcnt vmcnt(12) ; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v70 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:237 +; ALIGNED-NEXT: flat_store_byte v[86:87], v112 offset:222 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v70 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:231 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v71 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:229 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v71 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:227 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v69 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:225 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v69 ; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:392 ; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:396 ; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:388 ; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:384 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v82 offset:218 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v83 offset:222 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:220 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:216 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v81 offset:214 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:212 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v80 offset:210 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:208 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v68 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v68 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v82 offset:212 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v83 
offset:216 +; ALIGNED-NEXT: flat_store_byte v[86:87], v83 offset:217 +; ALIGNED-NEXT: flat_store_byte v[86:87], v82 offset:213 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v81 offset:208 +; ALIGNED-NEXT: flat_store_byte v[86:87], v81 offset:209 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v80 offset:204 +; ALIGNED-NEXT: flat_store_byte v[86:87], v80 offset:205 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v71 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v71 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v69 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v69 +; ALIGNED-NEXT: flat_store_byte v[86:87], v114 offset:216 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v68 +; ALIGNED-NEXT: flat_store_byte v[86:87], v115 offset:214 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v68 +; ALIGNED-NEXT: flat_store_byte v[86:87], v98 offset:220 ; ALIGNED-NEXT: s_waitcnt vmcnt(11) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v66 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v66 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:219 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v67 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:217 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v67 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:223 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v65 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:221 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v65 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:215 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v64 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:213 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v64 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:211 -; ALIGNED-NEXT: s_waitcnt vmcnt(10) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v54 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:209 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v54 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v66 +; ALIGNED-NEXT: flat_store_byte v[86:87], v99 offset:218 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 
8, v66 +; ALIGNED-NEXT: flat_store_byte v[86:87], v100 offset:212 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v67 +; ALIGNED-NEXT: flat_store_byte v[86:87], v101 offset:210 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v67 +; ALIGNED-NEXT: flat_store_byte v[86:87], v102 offset:208 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v65 +; ALIGNED-NEXT: flat_store_byte v[86:87], v103 offset:206 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v65 ; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:408 ; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:412 ; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:404 ; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:400 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v70 offset:202 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v71 offset:206 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:204 -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:200 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v69 offset:198 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:196 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v68 offset:194 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:192 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v55 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v52 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v52 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v55 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:203 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v53 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:201 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v53 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:207 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v70 offset:196 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v71 offset:200 +; ALIGNED-NEXT: flat_store_byte v[86:87], v71 offset:201 +; ALIGNED-NEXT: flat_store_byte v[86:87], v70 offset:197 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v69 offset:192 +; ALIGNED-NEXT: 
flat_store_byte v[86:87], v69 offset:193 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v68 offset:188 +; ALIGNED-NEXT: flat_store_byte v[86:87], v68 offset:189 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v64 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v64 +; ALIGNED-NEXT: s_waitcnt vmcnt(10) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v54 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v54 +; ALIGNED-NEXT: flat_store_byte v[86:87], v113 offset:200 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v55 +; ALIGNED-NEXT: flat_store_byte v[86:87], v112 offset:198 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v55 +; ALIGNED-NEXT: flat_store_byte v[86:87], v80 offset:204 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v52 +; ALIGNED-NEXT: flat_store_byte v[86:87], v81 offset:202 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v52 +; ALIGNED-NEXT: flat_store_byte v[86:87], v82 offset:196 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v53 +; ALIGNED-NEXT: flat_store_byte v[86:87], v83 offset:194 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v53 +; ALIGNED-NEXT: flat_store_byte v[86:87], v114 offset:192 ; ALIGNED-NEXT: s_waitcnt vmcnt(9) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v50 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:205 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v50 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:199 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v51 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:197 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v51 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:195 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v49 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:193 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v49 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v50 +; ALIGNED-NEXT: flat_store_byte v[86:87], v115 offset:190 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v50 ; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:488 ; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:492 ; 
ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:484 ; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:480 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v66 offset:186 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v67 offset:190 -; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:188 -; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:184 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v65 offset:182 -; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:180 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v64 offset:178 -; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:176 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v48 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v48 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:187 -; ALIGNED-NEXT: s_waitcnt vmcnt(8) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v39 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v38 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v38 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:185 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v39 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:191 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v37 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:189 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v37 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:183 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v36 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:181 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v36 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:179 -; ALIGNED-NEXT: s_waitcnt vmcnt(7) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v34 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:177 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v34 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v66 offset:180 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v67 offset:184 +; ALIGNED-NEXT: flat_store_byte v[86:87], v67 offset:185 +; ALIGNED-NEXT: flat_store_byte v[86:87], v66 offset:181 +; ALIGNED-NEXT: 
flat_store_byte_d16_hi v[96:97], v65 offset:176 +; ALIGNED-NEXT: flat_store_byte v[86:87], v65 offset:177 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v64 offset:172 +; ALIGNED-NEXT: flat_store_byte v[86:87], v64 offset:173 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v51 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v51 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v49 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v49 +; ALIGNED-NEXT: flat_store_byte v[86:87], v98 offset:184 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v48 +; ALIGNED-NEXT: flat_store_byte v[86:87], v99 offset:182 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v48 +; ALIGNED-NEXT: flat_store_byte v[86:87], v100 offset:188 +; ALIGNED-NEXT: flat_store_byte v[86:87], v101 offset:186 +; ALIGNED-NEXT: flat_store_byte v[86:87], v102 offset:180 +; ALIGNED-NEXT: flat_store_byte v[86:87], v103 offset:178 +; ALIGNED-NEXT: flat_store_byte v[86:87], v68 offset:176 +; ALIGNED-NEXT: flat_store_byte v[86:87], v69 offset:174 ; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:508 ; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:504 ; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:500 ; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:496 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v54 offset:170 -; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:168 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v55 offset:174 -; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:172 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v52 offset:162 -; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:160 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v53 offset:166 -; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:164 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v35 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v35 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 24, v33 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v116, 8, v33 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:171 -; 
ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v32 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:169 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v32 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:173 -; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v31 -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:163 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v31 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:161 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v29 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:175 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v30 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v30 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:167 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:165 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v29 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v54 offset:164 +; ALIGNED-NEXT: flat_store_byte v[86:87], v54 offset:165 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v55 offset:168 +; ALIGNED-NEXT: flat_store_byte v[86:87], v55 offset:169 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v52 offset:156 +; ALIGNED-NEXT: flat_store_byte v[86:87], v52 offset:157 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v53 offset:160 +; ALIGNED-NEXT: flat_store_byte v[86:87], v53 offset:161 +; ALIGNED-NEXT: s_waitcnt vmcnt(8) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v36 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v39 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v39 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v37 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v37 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v36 +; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v34 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v34 +; ALIGNED-NEXT: flat_store_byte v[86:87], v70 offset:168 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v35 +; ALIGNED-NEXT: 
flat_store_byte v[86:87], v71 offset:166 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v35 +; ALIGNED-NEXT: flat_store_byte v[86:87], v113 offset:172 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v33 +; ALIGNED-NEXT: flat_store_byte v[86:87], v112 offset:170 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v33 +; ALIGNED-NEXT: flat_store_byte v[86:87], v80 offset:160 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v32 +; ALIGNED-NEXT: flat_store_byte v[86:87], v81 offset:158 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v32 +; ALIGNED-NEXT: flat_store_byte v[86:87], v82 offset:164 +; ALIGNED-NEXT: flat_store_byte v[86:87], v83 offset:162 ; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:456 ; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:460 ; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:452 ; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:448 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v50 offset:154 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v51 offset:158 -; ALIGNED-NEXT: flat_store_byte v[96:97], v51 offset:156 -; ALIGNED-NEXT: flat_store_byte v[96:97], v50 offset:152 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v49 offset:150 -; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:148 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v48 offset:146 -; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:144 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v28 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v28 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v50 offset:148 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v51 offset:152 +; ALIGNED-NEXT: flat_store_byte v[86:87], v51 offset:153 +; ALIGNED-NEXT: flat_store_byte v[86:87], v50 offset:149 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v49 offset:144 +; ALIGNED-NEXT: flat_store_byte v[86:87], v49 offset:145 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v48 offset:140 +; ALIGNED-NEXT: flat_store_byte v[86:87], v48 offset:141 +; ALIGNED-NEXT: 
flat_store_byte v[86:87], v114 offset:152 +; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v28 +; ALIGNED-NEXT: flat_store_byte v[86:87], v115 offset:150 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v28 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v31 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v31 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v29 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v29 +; ALIGNED-NEXT: flat_store_byte v[86:87], v64 offset:156 ; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v26 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:155 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:153 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:159 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:157 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:151 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:149 -; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:147 -; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:145 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v26 +; ALIGNED-NEXT: flat_store_byte v[86:87], v65 offset:154 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v26 +; ALIGNED-NEXT: flat_store_byte v[86:87], v66 offset:148 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v27 +; ALIGNED-NEXT: flat_store_byte v[86:87], v67 offset:146 +; ALIGNED-NEXT: flat_store_byte v[86:87], v98 offset:144 +; ALIGNED-NEXT: flat_store_byte v[86:87], v99 offset:142 ; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:472 ; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:476 ; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:468 ; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:464 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v38 offset:138 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v39 offset:142 -; ALIGNED-NEXT: flat_store_byte 
v[96:97], v39 offset:140 -; ALIGNED-NEXT: flat_store_byte v[96:97], v38 offset:136 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v37 offset:134 -; ALIGNED-NEXT: flat_store_byte v[96:97], v37 offset:132 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v36 offset:130 -; ALIGNED-NEXT: flat_store_byte v[96:97], v36 offset:128 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:143 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v26 -; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:139 -; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:137 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:141 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:135 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:133 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:131 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:129 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v38 offset:132 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v39 offset:136 +; ALIGNED-NEXT: flat_store_byte v[86:87], v39 offset:137 +; ALIGNED-NEXT: flat_store_byte v[86:87], v38 offset:133 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v37 offset:128 +; ALIGNED-NEXT: flat_store_byte v[86:87], v37 offset:129 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v36 offset:124 +; ALIGNED-NEXT: flat_store_byte v[84:85], v36 offset:128 +; ALIGNED-NEXT: flat_store_byte v[86:87], v52 offset:128 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v27 +; ALIGNED-NEXT: flat_store_byte v[86:87], v100 offset:136 +; ALIGNED-NEXT: flat_store_byte v[86:87], v101 offset:134 +; ALIGNED-NEXT: flat_store_byte v[86:87], v102 offset:140 +; ALIGNED-NEXT: flat_store_byte v[86:87], v103 offset:138 +; ALIGNED-NEXT: flat_store_byte v[86:87], v68 offset:132 +; ALIGNED-NEXT: flat_store_byte v[86:87], v69 offset:130 +; ALIGNED-NEXT: flat_store_byte v[86:87], 
v53 offset:126 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v18 ; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:296 ; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:300 ; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:288 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v34 offset:122 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v35 offset:126 -; ALIGNED-NEXT: flat_store_byte v[96:97], v35 offset:124 -; ALIGNED-NEXT: flat_store_byte v[96:97], v34 offset:120 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v33 offset:118 -; ALIGNED-NEXT: flat_store_byte v[96:97], v33 offset:116 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v32 offset:114 -; ALIGNED-NEXT: flat_store_byte v[96:97], v32 offset:112 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v14 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:123 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:121 -; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:127 -; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:125 -; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:119 -; ALIGNED-NEXT: flat_store_byte v[96:97], v116 offset:117 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:115 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v10 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:113 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v34 offset:116 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v35 offset:120 +; ALIGNED-NEXT: flat_store_byte v[86:87], v35 offset:121 +; ALIGNED-NEXT: flat_store_byte v[86:87], v34 offset:117 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v33 offset:112 +; ALIGNED-NEXT: flat_store_byte v[86:87], v33 offset:113 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v32 offset:108 +; ALIGNED-NEXT: flat_store_byte v[86:87], v32 offset:109 +; ALIGNED-NEXT: flat_store_byte v[86:87], 
v54 offset:120 +; ALIGNED-NEXT: flat_store_byte v[86:87], v55 offset:118 +; ALIGNED-NEXT: flat_store_byte v[86:87], v70 offset:124 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v14 +; ALIGNED-NEXT: flat_store_byte v[86:87], v71 offset:122 +; ALIGNED-NEXT: flat_store_byte v[86:87], v113 offset:116 +; ALIGNED-NEXT: flat_store_byte v[86:87], v112 offset:114 +; ALIGNED-NEXT: flat_store_byte v[86:87], v80 offset:112 +; ALIGNED-NEXT: flat_store_byte v[86:87], v81 offset:110 ; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:312 ; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:316 ; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:308 ; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:304 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110 -; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108 -; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102 -; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98 -; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:111 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v6 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v27 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v18 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v14 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v10 -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:109 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v6 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:100 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:104 +; ALIGNED-NEXT: flat_store_byte v[86:87], v31 offset:105 +; ALIGNED-NEXT: flat_store_byte v[86:87], v30 offset:101 +; ALIGNED-NEXT: flat_store_byte_d16_hi 
v[96:97], v29 offset:96 +; ALIGNED-NEXT: flat_store_byte v[86:87], v29 offset:97 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:92 +; ALIGNED-NEXT: flat_store_byte v[86:87], v28 offset:93 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v10 +; ALIGNED-NEXT: flat_store_byte v[86:87], v114 offset:96 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v6 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v25 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v14 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v10 +; ALIGNED-NEXT: flat_store_byte v[86:87], v115 offset:94 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v6 ; ALIGNED-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v27 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v19 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v15 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v11 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:103 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v7 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v22 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v22 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v23 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v23 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 24, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v20 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v20 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v19 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v17 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v17 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v16 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v16 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v15 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v13 -; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v54, 8, v13 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 24, v12 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 8, v12 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v11 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v9 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v9 -; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:107 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v8 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:105 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v8 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v7 -; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:99 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v5 -; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:97 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v5 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v25 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v17 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v15 +; ALIGNED-NEXT: flat_store_byte v[86:87], v82 offset:104 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v11 +; ALIGNED-NEXT: flat_store_byte v[86:87], v83 offset:102 +; ALIGNED-NEXT: flat_store_byte v[86:87], v48 offset:108 +; ALIGNED-NEXT: flat_store_byte v[86:87], v49 offset:106 +; ALIGNED-NEXT: flat_store_byte v[86:87], v50 offset:100 +; ALIGNED-NEXT: flat_store_byte v[86:87], v51 offset:98 ; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:264 ; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:268 ; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:260 ; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:256 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v26 offset:90 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v27 offset:94 -; ALIGNED-NEXT: flat_store_byte v[96:97], v27 offset:92 -; ALIGNED-NEXT: flat_store_byte v[96:97], v26 offset:88 -; ALIGNED-NEXT: 
flat_store_byte_d16_hi v[96:97], v25 offset:86 -; ALIGNED-NEXT: flat_store_byte v[96:97], v25 offset:84 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v24 offset:82 -; ALIGNED-NEXT: flat_store_byte v[96:97], v24 offset:80 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v4 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v4 -; ALIGNED-NEXT: flat_store_byte v[96:97], v50 offset:91 -; ALIGNED-NEXT: flat_store_byte v[96:97], v51 offset:89 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:95 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:93 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:87 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:85 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:83 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:81 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v26 offset:84 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v27 offset:88 +; ALIGNED-NEXT: flat_store_byte v[86:87], v27 offset:89 +; ALIGNED-NEXT: flat_store_byte v[86:87], v26 offset:85 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v25 offset:80 +; ALIGNED-NEXT: flat_store_byte v[86:87], v25 offset:81 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v24 offset:76 +; ALIGNED-NEXT: flat_store_byte v[86:87], v24 offset:77 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v7 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 24, v22 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v22 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v17 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v15 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v13 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v13 +; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v28, 24, v12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v11 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v9 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v9 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v7 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 24, v5 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 8, v5 +; ALIGNED-NEXT: flat_store_byte v[86:87], v64 offset:88 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v4 +; ALIGNED-NEXT: flat_store_byte v[86:87], v65 offset:86 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v4 +; ALIGNED-NEXT: flat_store_byte v[86:87], v66 offset:92 +; ALIGNED-NEXT: flat_store_byte v[86:87], v67 offset:90 +; ALIGNED-NEXT: flat_store_byte v[86:87], v98 offset:84 +; ALIGNED-NEXT: flat_store_byte v[86:87], v99 offset:82 +; ALIGNED-NEXT: flat_store_byte v[86:87], v36 offset:80 +; ALIGNED-NEXT: flat_store_byte v[86:87], v37 offset:78 ; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:280 ; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:284 ; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:276 ; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:272 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v22 offset:74 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v23 offset:78 -; ALIGNED-NEXT: flat_store_byte v[96:97], v23 offset:76 -; ALIGNED-NEXT: flat_store_byte v[96:97], v22 offset:72 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v21 offset:70 -; ALIGNED-NEXT: flat_store_byte v[96:97], v21 offset:68 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v20 offset:66 -; ALIGNED-NEXT: flat_store_byte v[96:97], v20 offset:64 -; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:75 -; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:73 -; ALIGNED-NEXT: flat_store_byte v[96:97], v36 offset:79 -; ALIGNED-NEXT: flat_store_byte v[96:97], v37 offset:77 -; ALIGNED-NEXT: 
flat_store_byte v[96:97], v38 offset:71 -; ALIGNED-NEXT: flat_store_byte v[96:97], v39 offset:69 -; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:67 -; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:65 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v22 offset:68 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v23 offset:72 +; ALIGNED-NEXT: flat_store_byte v[86:87], v23 offset:73 +; ALIGNED-NEXT: flat_store_byte v[86:87], v22 offset:69 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v21 offset:64 +; ALIGNED-NEXT: flat_store_byte v[86:87], v21 offset:65 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v20 offset:60 +; ALIGNED-NEXT: flat_store_byte v[84:85], v20 offset:64 +; ALIGNED-NEXT: flat_store_byte v[86:87], v38 offset:72 +; ALIGNED-NEXT: flat_store_byte v[86:87], v39 offset:70 +; ALIGNED-NEXT: flat_store_byte v[86:87], v100 offset:76 +; ALIGNED-NEXT: flat_store_byte v[86:87], v101 offset:74 +; ALIGNED-NEXT: flat_store_byte v[86:87], v102 offset:68 +; ALIGNED-NEXT: flat_store_byte v[86:87], v103 offset:66 +; ALIGNED-NEXT: flat_store_byte v[86:87], v68 offset:64 +; ALIGNED-NEXT: flat_store_byte v[86:87], v69 offset:62 ; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:360 ; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:364 ; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:356 ; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:352 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:59 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v18 offset:58 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:57 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v19 offset:62 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:63 -; ALIGNED-NEXT: flat_store_byte v[96:97], v19 offset:60 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:61 -; ALIGNED-NEXT: flat_store_byte v[96:97], v18 offset:56 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:55 -; ALIGNED-NEXT: flat_store_byte_d16_hi 
v[96:97], v17 offset:54 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:53 -; ALIGNED-NEXT: flat_store_byte v[96:97], v17 offset:52 -; ALIGNED-NEXT: flat_store_byte v[96:97], v32 offset:51 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v16 offset:50 -; ALIGNED-NEXT: flat_store_byte v[96:97], v33 offset:49 -; ALIGNED-NEXT: flat_store_byte v[96:97], v16 offset:48 +; ALIGNED-NEXT: flat_store_byte v[86:87], v52 offset:56 +; ALIGNED-NEXT: flat_store_byte v[86:87], v53 offset:54 +; ALIGNED-NEXT: flat_store_byte v[86:87], v32 offset:60 +; ALIGNED-NEXT: flat_store_byte v[86:87], v33 offset:58 +; ALIGNED-NEXT: flat_store_byte v[86:87], v34 offset:52 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v18 offset:52 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v19 offset:56 +; ALIGNED-NEXT: flat_store_byte v[86:87], v19 offset:57 +; ALIGNED-NEXT: flat_store_byte v[86:87], v18 offset:53 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v17 offset:48 +; ALIGNED-NEXT: flat_store_byte v[86:87], v35 offset:50 +; ALIGNED-NEXT: flat_store_byte v[86:87], v17 offset:49 +; ALIGNED-NEXT: flat_store_byte v[86:87], v54 offset:48 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v16 offset:44 +; ALIGNED-NEXT: flat_store_byte v[86:87], v55 offset:46 +; ALIGNED-NEXT: flat_store_byte v[86:87], v16 offset:45 ; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:376 ; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:380 ; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:372 ; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:368 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v14 offset:42 -; ALIGNED-NEXT: flat_store_byte v[96:97], v34 offset:43 -; ALIGNED-NEXT: flat_store_byte v[96:97], v35 offset:41 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v15 offset:46 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:47 -; ALIGNED-NEXT: flat_store_byte v[96:97], v15 offset:44 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:45 
-; ALIGNED-NEXT: flat_store_byte v[96:97], v14 offset:40 -; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:39 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v13 offset:38 -; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:37 -; ALIGNED-NEXT: flat_store_byte v[96:97], v13 offset:36 -; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:35 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v12 offset:34 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:33 -; ALIGNED-NEXT: flat_store_byte v[96:97], v12 offset:32 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v14 offset:36 +; ALIGNED-NEXT: flat_store_byte v[86:87], v70 offset:40 +; ALIGNED-NEXT: flat_store_byte v[86:87], v71 offset:38 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v15 offset:40 +; ALIGNED-NEXT: flat_store_byte v[86:87], v113 offset:44 +; ALIGNED-NEXT: flat_store_byte v[86:87], v15 offset:41 +; ALIGNED-NEXT: flat_store_byte v[86:87], v112 offset:42 +; ALIGNED-NEXT: flat_store_byte v[86:87], v14 offset:37 +; ALIGNED-NEXT: flat_store_byte v[86:87], v80 offset:36 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v13 offset:32 +; ALIGNED-NEXT: flat_store_byte v[86:87], v81 offset:34 +; ALIGNED-NEXT: flat_store_byte v[86:87], v13 offset:33 +; ALIGNED-NEXT: flat_store_byte v[86:87], v28 offset:32 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v12 offset:28 +; ALIGNED-NEXT: flat_store_byte v[86:87], v29 offset:30 +; ALIGNED-NEXT: flat_store_byte v[84:85], v12 offset:32 ; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:328 ; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:332 ; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:324 ; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:320 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v10 offset:26 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:27 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:25 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v11 offset:30 -; 
ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:31 -; ALIGNED-NEXT: flat_store_byte v[96:97], v11 offset:28 -; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:29 -; ALIGNED-NEXT: flat_store_byte v[96:97], v10 offset:24 -; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:23 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v9 offset:22 -; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:21 -; ALIGNED-NEXT: flat_store_byte v[96:97], v9 offset:20 -; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:19 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v8 offset:18 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:17 -; ALIGNED-NEXT: flat_store_byte v[96:97], v8 offset:16 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v10 offset:20 +; ALIGNED-NEXT: flat_store_byte v[86:87], v30 offset:24 +; ALIGNED-NEXT: flat_store_byte v[86:87], v31 offset:22 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v11 offset:24 +; ALIGNED-NEXT: flat_store_byte v[86:87], v82 offset:28 +; ALIGNED-NEXT: flat_store_byte v[86:87], v11 offset:25 +; ALIGNED-NEXT: flat_store_byte v[86:87], v83 offset:26 +; ALIGNED-NEXT: flat_store_byte v[86:87], v10 offset:21 +; ALIGNED-NEXT: flat_store_byte v[86:87], v48 offset:20 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v9 offset:16 +; ALIGNED-NEXT: flat_store_byte v[86:87], v49 offset:18 +; ALIGNED-NEXT: flat_store_byte v[86:87], v9 offset:17 +; ALIGNED-NEXT: flat_store_byte v[86:87], v50 offset:16 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v8 offset:12 +; ALIGNED-NEXT: flat_store_byte v[86:87], v51 offset:14 +; ALIGNED-NEXT: flat_store_byte v[84:85], v8 offset:16 ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:344 ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:348 ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:340 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:336 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v6 offset:10 -; ALIGNED-NEXT: flat_store_byte 
v[96:97], v69 offset:11 -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:9 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v7 offset:14 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:15 -; ALIGNED-NEXT: flat_store_byte v[96:97], v7 offset:12 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:13 -; ALIGNED-NEXT: flat_store_byte v[96:97], v6 offset:8 -; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:7 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v5 offset:6 -; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:5 -; ALIGNED-NEXT: flat_store_byte v[96:97], v5 offset:4 -; ALIGNED-NEXT: flat_store_byte v[96:97], v24 offset:3 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v4 offset:2 -; ALIGNED-NEXT: flat_store_byte v[96:97], v25 offset:1 -; ALIGNED-NEXT: flat_store_byte v[96:97], v4 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v6 offset:4 +; ALIGNED-NEXT: flat_store_byte v[86:87], v114 offset:8 +; ALIGNED-NEXT: flat_store_byte v[86:87], v115 offset:6 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v7 offset:8 +; ALIGNED-NEXT: flat_store_byte v[86:87], v24 offset:12 +; ALIGNED-NEXT: flat_store_byte v[86:87], v7 offset:9 +; ALIGNED-NEXT: flat_store_byte v[86:87], v25 offset:10 +; ALIGNED-NEXT: flat_store_byte v[84:85], v6 offset:8 +; ALIGNED-NEXT: flat_store_byte v[86:87], v26 offset:4 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v5 +; ALIGNED-NEXT: flat_store_byte v[86:87], v27 offset:2 +; ALIGNED-NEXT: flat_store_byte v[84:85], v5 offset:4 +; ALIGNED-NEXT: flat_store_byte v[86:87], v64 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v4 offset:2 +; ALIGNED-NEXT: flat_store_byte v[84:85], v65 offset:1 +; ALIGNED-NEXT: flat_store_byte v[84:85], v4 ; ALIGNED-NEXT: s_cbranch_scc0 .LBB7_5 ; ALIGNED-NEXT: .LBB7_6: ; %Flow7 ; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s8 @@ -12503,25 +12669,27 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4 ; CHECK-NEXT: 
v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo ; CHECK-NEXT: s_add_u32 s4, s4, 0x100 +; CHECK-NEXT: v_add_co_u32 v102, vcc_lo, v100, 48 ; CHECK-NEXT: v_add_nc_u32_e32 v2, 0x100, v2 +; CHECK-NEXT: v_add_co_ci_u32_e64 v103, null, 0, v101, vcc_lo ; CHECK-NEXT: s_addc_u32 s5, s5, 0 ; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:240 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[64:67] offset:192 ; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:224 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:208 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[35:38] offset:192 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[31:34] offset:176 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[27:30] offset:160 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[52:55] offset:176 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[48:51] offset:160 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[35:38] offset:144 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[31:34] offset:128 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[27:30] offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:144 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[80:83] offset:96 ; CHECK-NEXT: s_waitcnt vmcnt(8) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:128 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[23:26] offset:112 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[19:22] offset:96 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[15:18] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[23:26] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[19:22] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[15:18] offset:32 ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[11:14] offset:64 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[7:10] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[7:10] ; CHECK-NEXT: 
flat_store_dwordx4 v[100:101], v[3:6] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(1) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99] offset:16 @@ -12607,29 +12775,31 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4 ; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo ; CHECK-NEXT: v_add_nc_u32_e32 v2, 0xffffff00, v2 +; CHECK-NEXT: v_add_co_u32 v102, vcc_lo, v100, 48 ; CHECK-NEXT: s_add_u32 s4, s4, 0xffffff00 +; CHECK-NEXT: v_add_co_ci_u32_e64 v103, null, 0, v101, vcc_lo ; CHECK-NEXT: s_addc_u32 s5, s5, -1 ; CHECK-NEXT: s_waitcnt vmcnt(35) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[31:34] offset:240 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[31:34] offset:192 ; CHECK-NEXT: s_waitcnt vmcnt(32) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[27:30] offset:224 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[27:30] offset:176 ; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:208 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[48:51] offset:160 ; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[35:38] offset:192 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[35:38] offset:144 ; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:176 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[64:67] offset:128 ; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:160 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[52:55] offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:144 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[68:71] offset:96 ; CHECK-NEXT: s_waitcnt vmcnt(8) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:128 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[23:26] offset:112 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[19:22] offset:96 -; 
CHECK-NEXT: flat_store_dwordx4 v[100:101], v[15:18] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[23:26] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[19:22] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[15:18] offset:32 ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[11:14] offset:64 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[7:10] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[7:10] ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[3:6] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(1) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99] offset:16 @@ -12736,18 +12906,18 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:47 ; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:48 ; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:49 -; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:50 +; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:50 ; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:51 ; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:52 -; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:53 -; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:54 -; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:55 -; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:56 +; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:53 +; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:54 +; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:55 +; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:56 ; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:57 ; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:58 ; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:60 -; ALIGNED-NEXT: buffer_load_ubyte v48, 
v2, s[0:3], 0 offen offset:61 -; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:62 +; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:61 +; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:62 ; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:63 ; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:64 ; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:65 @@ -12823,21 +12993,21 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: v_lshl_or_b32 v15, v30, 8, v29 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(25) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v32, 8, v34 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v32, 8, v33 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(23) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v36, 8, v31 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v34, 8, v31 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(21) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v35, 8, v33 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v36, 8, v35 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(16) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v48, 8, v37 -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v4, v39, 8, v37 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(14) -; ALIGNED-NEXT: v_lshl_or_b32 v5, v49, 8, v38 +; ALIGNED-NEXT: v_lshl_or_b32 v5, v49, 8, v48 ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v6, v50, 8, v39 +; ALIGNED-NEXT: v_lshl_or_b32 v6, v50, 8, v38 ; ALIGNED-NEXT: 
buffer_store_dword v7, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(10) ; ALIGNED-NEXT: v_lshl_or_b32 v7, v51, 8, v52 @@ -12890,17 +13060,17 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill ; ALIGNED-NEXT: 
buffer_store_dword v50, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill @@ -12917,7 +13087,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:1428 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:87 ; ALIGNED-NEXT: s_waitcnt vmcnt(7) ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1032 ; 4-byte Folded Spill @@ -12965,37 +13135,37 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:93 ; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:91 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1052 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1080 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:92 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1116 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(5) ; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1120 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1060 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1056 ; 4-byte Folded Spill ; 
ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1064 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1060 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1056 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1052 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1048 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v3 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:90 -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1080 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1072 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:101 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:89 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1076 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1084 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:88 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1084 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1076 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1072 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1068 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte 
Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1064 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3 ; ALIGNED-NEXT: s_clause 0x1 @@ -13102,7 +13272,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1280 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1288 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1284 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1220 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(3) @@ -13139,9 +13309,9 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1248 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:128 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1264 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1260 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1268 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1264 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1256 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) @@ -13154,7 +13324,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:150 ; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:151 -; ALIGNED-NEXT: buffer_store_dword v0, 
off, s[0:3], s32 offset:1308 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1304 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3 ; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:142 @@ -13163,31 +13333,31 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:141 ; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:139 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1320 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1316 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:140 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1428 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1432 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1432 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1328 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1336 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1324 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1328 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1316 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1312 ; 
4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v3 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:138 -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1360 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:149 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:137 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1352 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:136 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1372 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1424 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) @@ -13205,577 +13375,586 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1384 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:144 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1408 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1416 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1404 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1408 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:1400 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:158 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:157 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:156 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1448 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v126, v2, s[0:3], 0 offen offset:159 -; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:155 +; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:155 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v126, 8, v3 ; ALIGNED-NEXT: 
v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 ; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:152 -; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:153 -; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:154 +; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:153 +; ALIGNED-NEXT: buffer_load_ubyte v110, v2, s[0:3], 0 offen offset:154 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v122, 8, v123 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v111, 8, v123 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v121 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v120, 8, v110 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:160 -; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:161 -; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:162 -; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:163 -; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:164 -; ALIGNED-NEXT: buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:165 -; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:166 -; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:167 +; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:160 +; ALIGNED-NEXT: buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:161 +; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:162 +; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:163 +; ALIGNED-NEXT: 
buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:164 +; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:165 +; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:166 +; ALIGNED-NEXT: buffer_load_ubyte v93, v2, s[0:3], 0 offen offset:167 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v108, 8, v120 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v106, 8, v109 ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v105, 8, v104 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v94, 8, v95 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v94, 8, v107 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v106, 8, v111 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v93, 8, v104 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v105, 8, v107 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:172 +; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:172 ; ALIGNED-NEXT: buffer_load_ubyte v89, v2, s[0:3], 0 offen offset:173 -; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:174 -; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:175 +; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:174 +; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:175 ; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:171 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v89, 8, v92 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v89, 8, v90 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v88, 8, v90 +; ALIGNED-NEXT: 
v_lshl_or_b32 v1, v79, 8, v88 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:168 -; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:169 -; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:170 +; ALIGNED-NEXT: buffer_load_ubyte v77, v2, s[0:3], 0 offen offset:168 +; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:169 +; ALIGNED-NEXT: buffer_load_ubyte v74, v2, s[0:3], 0 offen offset:170 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 8, v79 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v75, 8, v77 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v78, 8, v75 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v78, 8, v74 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v74, v2, s[0:3], 0 offen offset:176 -; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:177 -; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:178 +; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:176 +; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:177 +; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:178 ; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:179 -; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:180 -; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:181 +; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:180 +; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen 
offset:181 ; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:182 -; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:183 +; ALIGNED-NEXT: buffer_load_ubyte v58, v2, s[0:3], 0 offen offset:183 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v74 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v63, 8, v73 ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v60, 8, v61 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v60, 8, v59 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v59, 8, v62 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v63, 8, v73 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v58, 8, v62 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v61, 8, v72 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1488 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v57, v2, s[0:3], 0 offen offset:188 +; ALIGNED-NEXT: buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:188 ; ALIGNED-NEXT: buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:189 -; ALIGNED-NEXT: buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:190 -; ALIGNED-NEXT: buffer_load_ubyte v46, v2, s[0:3], 0 offen offset:191 +; ALIGNED-NEXT: buffer_load_ubyte v46, v2, s[0:3], 0 offen offset:190 +; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:191 ; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:187 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v47, 8, v57 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v47, 8, v56 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v46, 8, v56 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v45, 8, v46 ; ALIGNED-NEXT: v_lshl_or_b32 v0, 
v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1488 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1492 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:184 -; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:185 -; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:186 +; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:184 +; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:185 +; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:186 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v45 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v42, 8, v43 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v44, 8, v42 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v44, 8, v40 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1492 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1496 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x3e -; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:192 -; ALIGNED-NEXT: buffer_load_ubyte v117, v2, s[0:3], 0 offen offset:193 +; ALIGNED-NEXT: buffer_load_ubyte v119, v2, s[0:3], 0 offen offset:192 +; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:193 ; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:194 -; ALIGNED-NEXT: buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:195 -; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:196 +; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:195 +; ALIGNED-NEXT: buffer_load_ubyte v117, v2, s[0:3], 0 offen offset:196 ; ALIGNED-NEXT: buffer_load_ubyte v115, v2, s[0:3], 0 offen offset:197 -; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:198 -; ALIGNED-NEXT: buffer_load_ubyte v112, v2, 
s[0:3], 0 offen offset:199 -; ALIGNED-NEXT: buffer_load_ubyte v102, v2, s[0:3], 0 offen offset:204 -; ALIGNED-NEXT: buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:205 +; ALIGNED-NEXT: buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:198 +; ALIGNED-NEXT: buffer_load_ubyte v103, v2, s[0:3], 0 offen offset:199 +; ALIGNED-NEXT: buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:204 +; ALIGNED-NEXT: buffer_load_ubyte v99, v2, s[0:3], 0 offen offset:205 ; ALIGNED-NEXT: buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:206 -; ALIGNED-NEXT: buffer_load_ubyte v99, v2, s[0:3], 0 offen offset:207 +; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:207 ; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:203 -; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:200 -; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:201 -; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:202 -; ALIGNED-NEXT: buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:212 -; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:213 -; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:214 -; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:215 -; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:211 -; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:216 -; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:217 +; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:200 +; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:201 +; ALIGNED-NEXT: buffer_load_ubyte v86, v2, s[0:3], 0 offen offset:202 +; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:212 +; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:213 +; ALIGNED-NEXT: buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:214 +; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:215 +; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 
0 offen offset:211 +; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:216 +; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:217 ; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:218 -; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:219 -; ALIGNED-NEXT: buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:220 +; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:219 +; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:220 ; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:221 -; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:222 -; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:223 -; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:208 -; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:209 -; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:210 -; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:224 -; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:225 -; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:226 +; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:222 +; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:223 +; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:208 +; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:209 +; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:210 +; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:224 +; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:225 +; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:226 ; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:227 -; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:228 -; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen 
offset:228 +; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:229 ; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:230 -; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:231 -; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:236 -; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:237 +; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:231 +; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:237 ; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:238 ; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:239 ; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:235 ; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:232 -; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:233 +; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:233 ; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:234 -; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:240 -; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:241 +; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:240 +; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:241 ; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:242 ; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:243 -; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:244 +; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:244 ; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:245 ; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:246 -; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:247 +; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:247 ; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:252 -; 
ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:253 -; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:254 +; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:253 +; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:254 ; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:255 ; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:251 ; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:248 ; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:249 -; ALIGNED-NEXT: s_clause 0x5 +; ALIGNED-NEXT: s_clause 0x2 ; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:250 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:2 -; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:3 -; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:4 -; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:5 -; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:6 +; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:4 +; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:6 ; ALIGNED-NEXT: s_waitcnt vmcnt(62) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v117, 8, v40 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v114, 8, v113 -; ALIGNED-NEXT: v_lshl_or_b32 v110, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v115, 8, v118 -; ALIGNED-NEXT: s_waitcnt vmcnt(61) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v112, 8, v116 -; ALIGNED-NEXT: v_lshl_or_b32 v93, v4, 16, v3 -; ALIGNED-NEXT: s_waitcnt vmcnt(59) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v101, 8, v102 -; ALIGNED-NEXT: s_waitcnt vmcnt(57) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v99, 8, v100 -; ALIGNED-NEXT: s_waitcnt vmcnt(13) -; ALIGNED-NEXT: v_lshl_or_b32 v95, v16, 8, v20 -; ALIGNED-NEXT: s_waitcnt vmcnt(9) -; ALIGNED-NEXT: v_lshl_or_b32 v109, v11, 8, v12 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v116, 8, v119 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v112, 8, v113 +; ALIGNED-NEXT: s_waitcnt 
vmcnt(56) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v99, 8, v101 +; ALIGNED-NEXT: s_waitcnt vmcnt(54) +; ALIGNED-NEXT: v_lshl_or_b32 v4, v98, 8, v100 +; ALIGNED-NEXT: v_lshl_or_b32 v108, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v115, 8, v117 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v103, 8, v114 ; ALIGNED-NEXT: v_lshl_or_b32 v91, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v96, 8, v98 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v97, 8, v87 -; ALIGNED-NEXT: v_lshl_or_b32 v77, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v71, 8, v85 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v69, 8, v83 -; ALIGNED-NEXT: v_lshl_or_b32 v58, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v81, 8, v84 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v66, 8, v65 +; ALIGNED-NEXT: s_waitcnt vmcnt(51) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v87, 8, v96 +; ALIGNED-NEXT: s_waitcnt vmcnt(50) +; ALIGNED-NEXT: v_lshl_or_b32 v4, v97, 8, v86 +; ALIGNED-NEXT: v_lshl_or_b32 v92, v1, 16, v0 +; ALIGNED-NEXT: s_waitcnt vmcnt(10) +; ALIGNED-NEXT: v_lshl_or_b32 v5, v15, 8, v20 +; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v13 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: v_lshl_or_b32 v121, v9, 8, v7 +; ALIGNED-NEXT: v_lshl_or_b32 v76, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v70, 8, v84 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v68, 8, v82 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:2 +; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:3 +; ALIGNED-NEXT: v_lshl_or_b32 v57, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v69, 8, v83 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v64, 8, v65 ; ALIGNED-NEXT: v_lshl_or_b32 v41, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v67, 8, v82 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v64, 8, v68 -; ALIGNED-NEXT: v_lshl_or_b32 v119, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v51, 8, v54 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v80, 8, v52 -; ALIGNED-NEXT: v_lshl_or_b32 v103, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v48, 8, 
v50 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v36, 8, v37 -; ALIGNED-NEXT: v_lshl_or_b32 v86, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v67, 8, v81 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v55, 8, v66 +; ALIGNED-NEXT: v_lshl_or_b32 v118, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v50, 8, v53 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v71, 8, v51 +; ALIGNED-NEXT: v_lshl_or_b32 v102, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v39, 8, v49 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v35, 8, v38 -; ALIGNED-NEXT: v_lshl_or_b32 v70, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v33, 8, v34 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v36, 8, v35 +; ALIGNED-NEXT: v_lshl_or_b32 v85, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v37, 8, v48 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v34, 8, v38 +; ALIGNED-NEXT: v_lshl_or_b32 v80, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v31, 8, v33 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v30, 8, v32 -; ALIGNED-NEXT: v_lshl_or_b32 v55, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v27, 8, v28 +; ALIGNED-NEXT: v_lshl_or_b32 v54, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v26, 8, v28 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v29, 8, v25 -; ALIGNED-NEXT: v_lshl_or_b32 v53, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v22, 8, v24 +; ALIGNED-NEXT: v_lshl_or_b32 v52, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v21, 8, v23 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v18, 8, v17 -; ALIGNED-NEXT: v_lshl_or_b32 v31, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v19, 8, v23 +; ALIGNED-NEXT: v_lshl_or_b32 v27, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v19, 8, v22 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen -; ALIGNED-NEXT: v_lshl_or_b32 v26, v95, 16, v4 -; ALIGNED-NEXT: v_lshl_or_b32 v95, v13, 8, v14 +; ALIGNED-NEXT: v_lshl_or_b32 v24, v5, 16, v4 +; ALIGNED-NEXT: v_lshl_or_b32 v5, v12, 8, v14 ; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:7 +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:1324 ; 4-byte Folded Spill +; 
ALIGNED-NEXT: v_lshl_or_b32 v16, v6, 16, v5 +; ALIGNED-NEXT: v_lshl_or_b32 v6, v8, 8, v10 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:5 +; ALIGNED-NEXT: buffer_store_dword v124, off, s[0:3], s32 offset:1296 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v6, v121, 16, v6 +; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:1 ; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1300 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:1296 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v21, v109, 16, v95 -; ALIGNED-NEXT: v_lshl_or_b32 v95, v8, 8, v10 -; ALIGNED-NEXT: v_lshl_or_b32 v109, v9, 8, v7 +; ALIGNED-NEXT: v_lshl_or_b32 v122, v0, 8, v1 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:15 +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1268 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1304 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v15, v109, 16, v95 -; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:1 -; ALIGNED-NEXT: v_lshl_or_b32 v109, v0, 8, v1 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1340 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1260 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1308 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v4, off, 
s[0:3], s32 offset:1340 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v95, off, s[0:3], s32 offset:1284 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v95, v95, 8, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 16, v95 -; ALIGNED-NEXT: v_lshl_or_b32 v95, v5, 8, v125 -; ALIGNED-NEXT: v_lshl_or_b32 v109, v4, 8, v6 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 16, v95 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1348 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:12 +; ALIGNED-NEXT: buffer_store_dword v121, off, s[0:3], s32 offset:1288 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v121, v121, 8, v3 +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:12 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v122, 16, v121 +; ALIGNED-NEXT: v_lshl_or_b32 v121, v5, 8, v124 +; ALIGNED-NEXT: v_lshl_or_b32 v122, v4, 8, v125 ; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:13 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:14 -; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:15 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1320 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v122, 16, v121 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1348 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:14 ; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:11 +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v95, v4, 8, v6 -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; 
ALIGNED-NEXT: v_lshl_or_b32 v109, v0, 8, v1 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:9 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:10 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 16, v95 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1360 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: v_lshl_or_b32 v121, v4, 8, v3 +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:10 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: v_lshl_or_b32 v122, v1, 8, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1372 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:9 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: v_mov_b32_e32 v4, v5 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v122, 16, v121 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:8 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1388 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v122, v5, 8, v3 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v109, v5, 8, v1 +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v95, v4, 8, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 16, v95 -; ALIGNED-NEXT: buffer_store_dword v0, 
off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v121, v1, 8, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1388 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v122, 16, v121 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:18 +; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:18 ; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:16 -; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:17 -; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:232 -; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:236 -; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:228 -; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:17 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:232 +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:236 +; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:228 +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:224 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:704 -; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:708 +; ALIGNED-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:704 +; ALIGNED-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:708 ; ALIGNED-NEXT: v_add_nc_u32_e32 v2, 0x100, v2 ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v127, 8, v109 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshl_or_b32 v127, v95, 8, v125 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_add_co_u32 v3, vcc_lo, v3, s4 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v4, null, s5, v4, vcc_lo -; ALIGNED-NEXT: flat_store_byte v[3:4], 
v7 offset:250 -; ALIGNED-NEXT: flat_store_byte v[3:4], v9 offset:251 -; ALIGNED-NEXT: flat_store_byte v[3:4], v8 offset:249 -; ALIGNED-NEXT: flat_store_byte v[3:4], v11 offset:255 -; ALIGNED-NEXT: flat_store_byte v[3:4], v13 offset:253 -; ALIGNED-NEXT: flat_store_byte v[3:4], v12 offset:254 -; ALIGNED-NEXT: flat_store_byte v[3:4], v14 offset:252 -; ALIGNED-NEXT: flat_store_byte v[3:4], v10 offset:248 -; ALIGNED-NEXT: flat_store_byte v[3:4], v17 offset:242 -; ALIGNED-NEXT: flat_store_byte v[3:4], v18 offset:243 -; ALIGNED-NEXT: flat_store_byte v[3:4], v22 offset:241 -; ALIGNED-NEXT: flat_store_byte v[3:4], v16 offset:247 -; ALIGNED-NEXT: flat_store_byte v[3:4], v19 offset:245 -; ALIGNED-NEXT: flat_store_byte v[3:4], v20 offset:246 -; ALIGNED-NEXT: flat_store_byte v[3:4], v23 offset:244 -; ALIGNED-NEXT: flat_store_byte v[3:4], v24 offset:240 -; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:248 -; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:252 -; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:244 -; ALIGNED-NEXT: buffer_store_dword v86, off, s[0:3], s32 offset:240 -; ALIGNED-NEXT: flat_store_byte v[3:4], v25 offset:234 -; ALIGNED-NEXT: flat_store_byte v[3:4], v29 offset:235 -; ALIGNED-NEXT: flat_store_byte v[3:4], v27 offset:233 -; ALIGNED-NEXT: flat_store_byte v[3:4], v30 offset:239 -; ALIGNED-NEXT: flat_store_byte v[3:4], v33 offset:237 -; ALIGNED-NEXT: flat_store_byte v[3:4], v32 offset:238 -; ALIGNED-NEXT: flat_store_byte v[3:4], v34 offset:236 -; ALIGNED-NEXT: flat_store_byte v[3:4], v28 offset:232 -; ALIGNED-NEXT: flat_store_byte v[3:4], v37 offset:226 -; ALIGNED-NEXT: flat_store_byte v[3:4], v36 offset:227 -; ALIGNED-NEXT: flat_store_byte v[3:4], v48 offset:225 -; ALIGNED-NEXT: flat_store_byte v[3:4], v35 offset:231 -; ALIGNED-NEXT: flat_store_byte v[3:4], v39 offset:229 -; ALIGNED-NEXT: flat_store_byte v[3:4], v38 offset:230 -; ALIGNED-NEXT: flat_store_byte v[3:4], v49 offset:228 -; ALIGNED-NEXT: 
flat_store_byte v[3:4], v50 offset:224 -; ALIGNED-NEXT: buffer_store_dword v103, off, s[0:3], s32 offset:192 -; ALIGNED-NEXT: buffer_store_dword v119, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v127, 8, v124 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: v_lshl_or_b32 v127, v1, 8, v125 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_add_co_u32 v121, vcc_lo, v5, s4 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v122, null, s5, v6, vcc_lo +; ALIGNED-NEXT: v_lshl_or_b32 v127, v0, 16, v127 +; ALIGNED-NEXT: v_add_co_u32 v5, vcc_lo, v121, 3 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v122, vcc_lo +; ALIGNED-NEXT: flat_store_byte v[5:6], v7 offset:247 +; ALIGNED-NEXT: flat_store_byte v[5:6], v9 offset:248 +; ALIGNED-NEXT: flat_store_byte v[5:6], v8 offset:246 +; ALIGNED-NEXT: flat_store_byte v[5:6], v11 offset:252 +; ALIGNED-NEXT: flat_store_byte v[5:6], v12 offset:250 +; ALIGNED-NEXT: flat_store_byte v[5:6], v13 offset:251 +; ALIGNED-NEXT: flat_store_byte v[5:6], v14 offset:249 +; ALIGNED-NEXT: flat_store_byte v[5:6], v10 offset:245 +; ALIGNED-NEXT: flat_store_byte v[5:6], v17 offset:239 +; ALIGNED-NEXT: flat_store_byte v[5:6], v18 offset:240 +; ALIGNED-NEXT: flat_store_byte v[5:6], v21 offset:238 +; ALIGNED-NEXT: flat_store_byte v[5:6], v15 offset:244 +; ALIGNED-NEXT: flat_store_byte v[5:6], v19 offset:242 +; ALIGNED-NEXT: flat_store_byte v[5:6], v20 offset:243 +; ALIGNED-NEXT: flat_store_byte v[5:6], v22 offset:241 +; ALIGNED-NEXT: flat_store_byte v[5:6], v23 offset:237 +; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:248 +; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:252 +; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:244 +; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:240 +; ALIGNED-NEXT: flat_store_byte v[5:6], v25 offset:231 +; ALIGNED-NEXT: flat_store_byte v[5:6], v29 offset:232 +; ALIGNED-NEXT: flat_store_byte v[5:6], v26 offset:230 
+; ALIGNED-NEXT: flat_store_byte v[5:6], v30 offset:236 +; ALIGNED-NEXT: flat_store_byte v[5:6], v31 offset:234 +; ALIGNED-NEXT: flat_store_byte v[5:6], v32 offset:235 +; ALIGNED-NEXT: flat_store_byte v[5:6], v33 offset:233 +; ALIGNED-NEXT: flat_store_byte v[5:6], v28 offset:229 +; ALIGNED-NEXT: flat_store_byte v[5:6], v35 offset:223 +; ALIGNED-NEXT: flat_store_byte v[5:6], v36 offset:224 +; ALIGNED-NEXT: flat_store_byte v[5:6], v39 offset:222 +; ALIGNED-NEXT: flat_store_byte v[5:6], v34 offset:228 +; ALIGNED-NEXT: flat_store_byte v[5:6], v37 offset:226 +; ALIGNED-NEXT: flat_store_byte v[5:6], v38 offset:227 +; ALIGNED-NEXT: flat_store_byte v[5:6], v48 offset:225 +; ALIGNED-NEXT: flat_store_byte v[5:6], v49 offset:221 +; ALIGNED-NEXT: buffer_store_dword v102, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: buffer_store_dword v118, off, s[0:3], s32 offset:204 ; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:200 -; ALIGNED-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:196 -; ALIGNED-NEXT: flat_store_byte v[3:4], v71 offset:213 -; ALIGNED-NEXT: flat_store_byte v[3:4], v69 offset:215 -; ALIGNED-NEXT: flat_store_byte v[3:4], v51 offset:209 -; ALIGNED-NEXT: flat_store_byte v[3:4], v80 offset:211 -; ALIGNED-NEXT: flat_store_byte v[3:4], v52 offset:210 -; ALIGNED-NEXT: flat_store_byte v[3:4], v83 offset:214 -; ALIGNED-NEXT: flat_store_byte v[3:4], v85 offset:212 -; ALIGNED-NEXT: flat_store_byte v[3:4], v65 offset:218 -; ALIGNED-NEXT: flat_store_byte v[3:4], v66 offset:219 -; ALIGNED-NEXT: flat_store_byte v[3:4], v81 offset:217 -; ALIGNED-NEXT: flat_store_byte v[3:4], v64 offset:223 -; ALIGNED-NEXT: flat_store_byte v[3:4], v67 offset:221 -; ALIGNED-NEXT: flat_store_byte v[3:4], v68 offset:222 -; ALIGNED-NEXT: flat_store_byte v[3:4], v82 offset:220 -; ALIGNED-NEXT: flat_store_byte v[3:4], v84 offset:216 -; ALIGNED-NEXT: flat_store_byte v[3:4], v54 offset:208 -; ALIGNED-NEXT: buffer_store_dword v77, off, s[0:3], s32 offset:216 +; ALIGNED-NEXT: 
buffer_store_dword v57, off, s[0:3], s32 offset:196 +; ALIGNED-NEXT: flat_store_byte v[5:6], v70 offset:210 +; ALIGNED-NEXT: flat_store_byte v[5:6], v68 offset:212 +; ALIGNED-NEXT: flat_store_byte v[5:6], v50 offset:206 +; ALIGNED-NEXT: flat_store_byte v[5:6], v71 offset:208 +; ALIGNED-NEXT: flat_store_byte v[5:6], v51 offset:207 +; ALIGNED-NEXT: flat_store_byte v[5:6], v82 offset:211 +; ALIGNED-NEXT: flat_store_byte v[5:6], v84 offset:209 +; ALIGNED-NEXT: flat_store_byte v[5:6], v65 offset:215 +; ALIGNED-NEXT: flat_store_byte v[5:6], v64 offset:216 +; ALIGNED-NEXT: flat_store_byte v[5:6], v69 offset:214 +; ALIGNED-NEXT: flat_store_byte v[5:6], v55 offset:220 +; ALIGNED-NEXT: flat_store_byte v[5:6], v67 offset:218 +; ALIGNED-NEXT: flat_store_byte v[5:6], v66 offset:219 +; ALIGNED-NEXT: flat_store_byte v[5:6], v81 offset:217 +; ALIGNED-NEXT: flat_store_byte v[5:6], v83 offset:213 +; ALIGNED-NEXT: flat_store_byte v[5:6], v53 offset:205 +; ALIGNED-NEXT: buffer_store_dword v76, off, s[0:3], s32 offset:216 ; ALIGNED-NEXT: buffer_store_dword v91, off, s[0:3], s32 offset:220 -; ALIGNED-NEXT: buffer_store_dword v93, off, s[0:3], s32 offset:212 -; ALIGNED-NEXT: buffer_store_dword v110, off, s[0:3], s32 offset:208 -; ALIGNED-NEXT: flat_store_byte v[3:4], v87 offset:202 -; ALIGNED-NEXT: flat_store_byte v[3:4], v97 offset:203 -; ALIGNED-NEXT: flat_store_byte v[3:4], v96 offset:201 -; ALIGNED-NEXT: flat_store_byte v[3:4], v99 offset:207 -; ALIGNED-NEXT: flat_store_byte v[3:4], v101 offset:205 -; ALIGNED-NEXT: flat_store_byte v[3:4], v100 offset:206 -; ALIGNED-NEXT: flat_store_byte v[3:4], v102 offset:204 -; ALIGNED-NEXT: flat_store_byte v[3:4], v98 offset:200 -; ALIGNED-NEXT: flat_store_byte v[3:4], v113 offset:194 -; ALIGNED-NEXT: flat_store_byte v[3:4], v114 offset:195 -; ALIGNED-NEXT: flat_store_byte v[3:4], v117 offset:193 -; ALIGNED-NEXT: flat_store_byte v[3:4], v112 offset:199 -; ALIGNED-NEXT: flat_store_byte v[3:4], v115 offset:197 -; ALIGNED-NEXT: flat_store_byte 
v[3:4], v116 offset:198 -; ALIGNED-NEXT: flat_store_byte v[3:4], v118 offset:196 -; ALIGNED-NEXT: flat_store_byte v[3:4], v40 offset:192 -; ALIGNED-NEXT: v_lshl_or_b32 v127, v0, 16, v127 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1492 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v92, off, s[0:3], s32 offset:212 +; ALIGNED-NEXT: buffer_store_dword v108, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: flat_store_byte v[5:6], v86 offset:199 +; ALIGNED-NEXT: flat_store_byte v[5:6], v97 offset:200 +; ALIGNED-NEXT: flat_store_byte v[5:6], v87 offset:198 +; ALIGNED-NEXT: flat_store_byte v[5:6], v98 offset:204 +; ALIGNED-NEXT: flat_store_byte v[5:6], v99 offset:202 +; ALIGNED-NEXT: flat_store_byte v[5:6], v100 offset:203 +; ALIGNED-NEXT: flat_store_byte v[5:6], v101 offset:201 +; ALIGNED-NEXT: flat_store_byte v[5:6], v96 offset:197 +; ALIGNED-NEXT: flat_store_byte v[5:6], v113 offset:191 +; ALIGNED-NEXT: flat_store_byte v[5:6], v112 offset:192 +; ALIGNED-NEXT: flat_store_byte v[5:6], v116 offset:190 +; ALIGNED-NEXT: flat_store_byte v[5:6], v103 offset:196 +; ALIGNED-NEXT: flat_store_byte v[5:6], v115 offset:194 +; ALIGNED-NEXT: flat_store_byte v[5:6], v114 offset:195 +; ALIGNED-NEXT: flat_store_byte v[5:6], v117 offset:193 +; ALIGNED-NEXT: flat_store_byte v[5:6], v119 offset:189 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1496 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 ; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 ; ALIGNED-NEXT: s_cmp_lg_u64 s[4:5], 0x800 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1488 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1492 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte 
Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1488 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 -; ALIGNED-NEXT: flat_store_byte v[3:4], v42 offset:186 -; ALIGNED-NEXT: flat_store_byte v[3:4], v44 offset:187 -; ALIGNED-NEXT: flat_store_byte v[3:4], v43 offset:185 -; ALIGNED-NEXT: flat_store_byte v[3:4], v46 offset:191 -; ALIGNED-NEXT: flat_store_byte v[3:4], v47 offset:189 -; ALIGNED-NEXT: flat_store_byte v[3:4], v56 offset:190 -; ALIGNED-NEXT: flat_store_byte v[3:4], v57 offset:188 -; ALIGNED-NEXT: flat_store_byte v[3:4], v45 offset:184 -; ALIGNED-NEXT: flat_store_byte v[3:4], v61 offset:178 -; ALIGNED-NEXT: flat_store_byte v[3:4], v60 offset:179 -; ALIGNED-NEXT: flat_store_byte v[3:4], v72 offset:177 -; ALIGNED-NEXT: flat_store_byte v[3:4], v59 offset:183 -; ALIGNED-NEXT: flat_store_byte v[3:4], v63 offset:181 -; ALIGNED-NEXT: flat_store_byte v[3:4], v62 offset:182 -; ALIGNED-NEXT: flat_store_byte v[3:4], v73 offset:180 -; ALIGNED-NEXT: flat_store_byte v[3:4], v74 offset:176 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v40 offset:183 +; ALIGNED-NEXT: flat_store_byte v[5:6], v44 offset:184 +; ALIGNED-NEXT: flat_store_byte v[5:6], v42 offset:182 +; ALIGNED-NEXT: flat_store_byte v[5:6], v45 offset:188 +; ALIGNED-NEXT: flat_store_byte v[5:6], v47 offset:186 +; ALIGNED-NEXT: flat_store_byte v[5:6], v46 offset:187 +; ALIGNED-NEXT: flat_store_byte v[5:6], v56 offset:185 +; ALIGNED-NEXT: flat_store_byte v[5:6], v43 offset:181 +; ALIGNED-NEXT: flat_store_byte v[5:6], v59 offset:175 +; ALIGNED-NEXT: 
flat_store_byte v[5:6], v60 offset:176 +; ALIGNED-NEXT: flat_store_byte v[5:6], v63 offset:174 +; ALIGNED-NEXT: flat_store_byte v[5:6], v58 offset:180 +; ALIGNED-NEXT: flat_store_byte v[5:6], v61 offset:178 +; ALIGNED-NEXT: flat_store_byte v[5:6], v62 offset:179 +; ALIGNED-NEXT: flat_store_byte v[5:6], v72 offset:177 +; ALIGNED-NEXT: flat_store_byte v[5:6], v73 offset:173 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 -; ALIGNED-NEXT: flat_store_byte v[3:4], v75 offset:170 -; ALIGNED-NEXT: flat_store_byte v[3:4], v78 offset:171 -; ALIGNED-NEXT: flat_store_byte v[3:4], v76 offset:169 -; ALIGNED-NEXT: flat_store_byte v[3:4], v88 offset:175 -; ALIGNED-NEXT: flat_store_byte v[3:4], v89 offset:173 -; ALIGNED-NEXT: flat_store_byte v[3:4], v90 offset:174 -; ALIGNED-NEXT: flat_store_byte v[3:4], v92 offset:172 -; ALIGNED-NEXT: flat_store_byte v[3:4], v79 offset:168 -; ALIGNED-NEXT: flat_store_byte v[3:4], v104 offset:162 -; ALIGNED-NEXT: flat_store_byte v[3:4], v105 offset:163 -; ALIGNED-NEXT: 
flat_store_byte v[3:4], v108 offset:161 -; ALIGNED-NEXT: flat_store_byte v[3:4], v94 offset:167 -; ALIGNED-NEXT: flat_store_byte v[3:4], v106 offset:165 -; ALIGNED-NEXT: flat_store_byte v[3:4], v107 offset:166 -; ALIGNED-NEXT: flat_store_byte v[3:4], v111 offset:164 -; ALIGNED-NEXT: flat_store_byte v[3:4], v120 offset:160 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v74 offset:167 +; ALIGNED-NEXT: flat_store_byte v[5:6], v78 offset:168 +; ALIGNED-NEXT: flat_store_byte v[5:6], v75 offset:166 +; ALIGNED-NEXT: flat_store_byte v[5:6], v79 offset:172 +; ALIGNED-NEXT: flat_store_byte v[5:6], v89 offset:170 +; ALIGNED-NEXT: flat_store_byte v[5:6], v88 offset:171 +; ALIGNED-NEXT: flat_store_byte v[5:6], v90 offset:169 +; ALIGNED-NEXT: flat_store_byte v[5:6], v77 offset:165 +; ALIGNED-NEXT: flat_store_byte v[5:6], v95 offset:159 +; ALIGNED-NEXT: flat_store_byte v[5:6], v94 offset:160 +; ALIGNED-NEXT: flat_store_byte v[5:6], v106 offset:158 +; ALIGNED-NEXT: flat_store_byte v[5:6], v93 offset:164 +; ALIGNED-NEXT: flat_store_byte v[5:6], v105 offset:162 +; ALIGNED-NEXT: flat_store_byte v[5:6], v104 offset:163 +; ALIGNED-NEXT: flat_store_byte v[5:6], v107 offset:161 +; ALIGNED-NEXT: flat_store_byte v[5:6], v109 offset:157 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Reload ; ALIGNED-NEXT: 
s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 -; ALIGNED-NEXT: flat_store_byte v[3:4], v121 offset:154 -; ALIGNED-NEXT: flat_store_byte v[3:4], v124 offset:155 -; ALIGNED-NEXT: flat_store_byte v[3:4], v122 offset:153 -; ALIGNED-NEXT: flat_store_byte v[3:4], v126 offset:159 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:157 +; ALIGNED-NEXT: flat_store_byte v[5:6], v110 offset:151 +; ALIGNED-NEXT: flat_store_byte v[5:6], v120 offset:152 +; ALIGNED-NEXT: flat_store_byte v[5:6], v111 offset:150 +; ALIGNED-NEXT: flat_store_byte v[5:6], v126 offset:156 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1452 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:158 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:154 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:156 -; ALIGNED-NEXT: flat_store_byte v[3:4], v123 offset:152 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:155 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:153 +; ALIGNED-NEXT: flat_store_byte v[5:6], v123 offset:149 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1420 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; 
ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:146 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:143 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:147 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:144 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:145 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1432 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:142 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:151 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:148 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:149 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1428 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:146 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1432 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:150 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:147 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1416 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:148 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:145 +; ALIGNED-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:144 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:141 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1384 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1352 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1320 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1316 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1308 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1304 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1372 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1368 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:138 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1360 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:135 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1364 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:139 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:136 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1356 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 
offset:137 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:134 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1336 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:143 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1324 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:141 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:140 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1328 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:142 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1316 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:138 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1332 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:140 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:139 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:137 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1344 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:136 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:133 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1272 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:130 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1264 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:127 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1260 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:131 +; ALIGNED-NEXT: 
flat_store_byte v[5:6], v0 offset:128 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1256 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:129 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1288 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:126 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1284 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:135 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:132 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1276 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:133 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:130 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1280 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:134 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1268 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:131 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1264 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:132 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:129 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1252 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:128 +; ALIGNED-NEXT: flat_store_byte v[121:122], v0 offset:128 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1248 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:360 @@ -13790,52 +13969,52 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:352 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1244 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:122 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:119 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1240 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:123 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:120 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1236 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:121 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:118 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1224 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:127 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:124 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1216 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:125 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:122 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:126 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:123 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:124 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:121 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1228 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:120 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:117 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1188 ; 4-byte Folded Reload ; ALIGNED-NEXT: 
s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:114 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:111 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:115 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:112 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1176 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:113 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:110 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:119 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:116 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:117 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:114 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1196 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:118 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:115 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1184 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:116 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:113 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:112 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:109 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:376 @@ -13850,112 +14029,112 @@ 
define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:368 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1164 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:106 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:103 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1160 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:107 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:104 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1152 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:105 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:102 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1144 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:111 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:108 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1136 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:109 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:106 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1140 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:110 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:107 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:108 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:105 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1148 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:104 +; ALIGNED-NEXT: 
flat_store_byte v[5:6], v0 offset:101 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1104 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:98 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:95 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1100 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:99 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:96 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1096 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:97 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:94 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1120 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:103 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:100 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1112 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:101 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:98 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1116 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:102 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:99 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1108 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:100 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:97 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1092 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:96 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:93 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1088 ; 4-byte Folded Reload ; 
ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1076 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1084 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1052 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1080 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1044 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1084 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:90 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1080 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1076 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:91 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:87 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1072 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:89 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1064 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:95 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1056 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:88 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; 
ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:93 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:86 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1060 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:94 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:92 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1052 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:90 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1056 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:91 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1048 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:92 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:89 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1064 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:88 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:85 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1024 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:82 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:79 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1020 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:83 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:80 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1016 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:81 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:78 ; ALIGNED-NEXT: buffer_load_dword v0, off, 
s[0:3], s32 offset:1040 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:87 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:84 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1032 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:85 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:82 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1036 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:86 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:83 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1028 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:84 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:81 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1012 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:80 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:77 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1008 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:344 @@ -13970,52 +14149,52 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1004 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:74 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:71 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1000 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:75 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:72 ; ALIGNED-NEXT: buffer_load_dword v0, off, 
s[0:3], s32 offset:992 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:73 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:70 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:980 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:79 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:76 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:77 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:74 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:78 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:75 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:76 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:73 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:988 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:72 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:69 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:66 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:63 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:67 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:64 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:65 +; 
ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:62 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:71 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:68 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:69 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:66 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:70 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:67 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:68 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:65 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:64 +; ALIGNED-NEXT: flat_store_byte v[121:122], v0 offset:64 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 @@ -14028,61 +14207,61 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:61 +; ALIGNED-NEXT: 
flat_store_byte v[5:6], v0 offset:58 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:58 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:55 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:59 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:56 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:57 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:54 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:63 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:60 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:62 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:59 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:60 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:57 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:56 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:53 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:864 
; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:53 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:50 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:50 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:47 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:51 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:48 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:49 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:46 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:55 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:52 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:54 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:51 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:52 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:49 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:48 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:45 ; ALIGNED-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:440 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload @@ -14090,52 +14269,52 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:432 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:43 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:40 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:42 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:39 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:41 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:38 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:40 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:37 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte 
v[3:4], v0 offset:47 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:44 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:46 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:43 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:45 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:42 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:44 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:41 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:35 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:32 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:34 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:31 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:33 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:30 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:32 +; ALIGNED-NEXT: flat_store_byte v[121:122], v0 offset:32 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:39 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:36 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 
offset:776 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:38 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:35 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:37 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:34 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:36 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:33 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:392 @@ -14148,295 +14327,298 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:384 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:26 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:23 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:27 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:24 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:25 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:22 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:31 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:28 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 
offset:736 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:29 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:26 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:30 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:27 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:28 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:25 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:24 -; ALIGNED-NEXT: flat_store_byte v[3:4], v109 offset:18 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:21 +; ALIGNED-NEXT: flat_store_byte v[5:6], v124 offset:15 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1428 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:19 -; ALIGNED-NEXT: flat_store_byte v[3:4], v95 offset:17 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:16 +; ALIGNED-NEXT: flat_store_byte v[5:6], v1 offset:14 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:23 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:20 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:21 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:18 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt 
vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:22 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:19 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:20 -; ALIGNED-NEXT: flat_store_byte v[3:4], v125 offset:16 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:17 +; ALIGNED-NEXT: flat_store_byte v[121:122], v125 offset:16 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1348 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1320 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 -; ALIGNED-NEXT: flat_store_byte v[3:4], v1 offset:10 -; ALIGNED-NEXT: flat_store_byte v[3:4], v5 offset:11 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1376 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v3 offset:7 +; ALIGNED-NEXT: flat_store_byte v[5:6], v4 offset:8 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1380 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:13 -; 
ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1388 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:10 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:9 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1368 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:6 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1376 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:15 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1364 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:12 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1372 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:14 -; ALIGNED-NEXT: flat_store_byte v[3:4], v6 offset:12 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1380 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:11 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1360 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:9 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1388 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:8 +; ALIGNED-NEXT: flat_store_byte v[121:122], v0 offset:8 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1300 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:2 +; ALIGNED-NEXT: flat_store_byte v[121:122], v0 offset:2 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:3 
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1284 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1288 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:1 +; ALIGNED-NEXT: flat_store_byte v[121:122], v0 offset:1 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1340 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:7 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1304 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:4 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1308 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:5 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1332 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:2 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1324 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:6 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:3 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1296 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:4 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1260 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[121:122], v0 offset:4 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1268 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 +; ALIGNED-NEXT: flat_store_byte v[121:122], v0 ; ALIGNED-NEXT: s_cbranch_scc1 .LBB9_1 ; ALIGNED-NEXT: .LBB9_2: ; %Flow10 ; ALIGNED-NEXT: s_andn2_saveexec_b32 s8, s6 ; ALIGNED-NEXT: s_cbranch_execz .LBB9_5 ; ALIGNED-NEXT: ; %bb.3: ; 
%memmove_bwd_loop.preheader -; ALIGNED-NEXT: v_add_nc_u32_e32 v4, 0x700, v2 +; ALIGNED-NEXT: v_add_nc_u32_e32 v6, 0x700, v2 ; ALIGNED-NEXT: s_movk_i32 s6, 0xff00 ; ALIGNED-NEXT: s_mov_b64 s[4:5], 0x700 ; ALIGNED-NEXT: s_mov_b32 s7, -1 ; ALIGNED-NEXT: .LBB9_4: ; %memmove_bwd_loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 -; ALIGNED-NEXT: s_clause 0x39 -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:20 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:21 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:22 -; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:23 -; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:24 -; ALIGNED-NEXT: buffer_load_ubyte v10, v4, s[0:3], 0 offen offset:25 -; ALIGNED-NEXT: buffer_load_ubyte v12, v4, s[0:3], 0 offen offset:26 -; ALIGNED-NEXT: buffer_load_ubyte v125, v4, s[0:3], 0 offen offset:19 -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:28 -; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:29 -; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:30 -; ALIGNED-NEXT: buffer_load_ubyte v9, v4, s[0:3], 0 offen offset:31 -; ALIGNED-NEXT: buffer_load_ubyte v14, v4, s[0:3], 0 offen offset:32 -; ALIGNED-NEXT: buffer_load_ubyte v15, v4, s[0:3], 0 offen offset:33 -; ALIGNED-NEXT: buffer_load_ubyte v17, v4, s[0:3], 0 offen offset:34 -; ALIGNED-NEXT: buffer_load_ubyte v11, v4, s[0:3], 0 offen offset:27 -; ALIGNED-NEXT: buffer_load_ubyte v19, v4, s[0:3], 0 offen offset:35 -; ALIGNED-NEXT: buffer_load_ubyte v13, v4, s[0:3], 0 offen offset:36 -; ALIGNED-NEXT: buffer_load_ubyte v16, v4, s[0:3], 0 offen offset:37 -; ALIGNED-NEXT: buffer_load_ubyte v18, v4, s[0:3], 0 offen offset:38 -; ALIGNED-NEXT: buffer_load_ubyte v20, v4, s[0:3], 0 offen offset:39 -; ALIGNED-NEXT: buffer_load_ubyte v22, v4, s[0:3], 0 offen offset:40 -; ALIGNED-NEXT: buffer_load_ubyte v23, v4, s[0:3], 0 offen offset:41 -; ALIGNED-NEXT: buffer_load_ubyte 
v25, v4, s[0:3], 0 offen offset:42 -; ALIGNED-NEXT: buffer_load_ubyte v28, v4, s[0:3], 0 offen offset:43 -; ALIGNED-NEXT: buffer_load_ubyte v21, v4, s[0:3], 0 offen offset:44 -; ALIGNED-NEXT: buffer_load_ubyte v24, v4, s[0:3], 0 offen offset:45 -; ALIGNED-NEXT: buffer_load_ubyte v26, v4, s[0:3], 0 offen offset:46 -; ALIGNED-NEXT: buffer_load_ubyte v27, v4, s[0:3], 0 offen offset:47 -; ALIGNED-NEXT: buffer_load_ubyte v30, v4, s[0:3], 0 offen offset:48 -; ALIGNED-NEXT: buffer_load_ubyte v31, v4, s[0:3], 0 offen offset:49 -; ALIGNED-NEXT: buffer_load_ubyte v33, v4, s[0:3], 0 offen offset:50 -; ALIGNED-NEXT: buffer_load_ubyte v34, v4, s[0:3], 0 offen offset:51 -; ALIGNED-NEXT: buffer_load_ubyte v32, v4, s[0:3], 0 offen offset:52 -; ALIGNED-NEXT: buffer_load_ubyte v37, v4, s[0:3], 0 offen offset:53 -; ALIGNED-NEXT: buffer_load_ubyte v35, v4, s[0:3], 0 offen offset:54 -; ALIGNED-NEXT: buffer_load_ubyte v36, v4, s[0:3], 0 offen offset:55 -; ALIGNED-NEXT: buffer_load_ubyte v48, v4, s[0:3], 0 offen offset:56 -; ALIGNED-NEXT: buffer_load_ubyte v51, v4, s[0:3], 0 offen offset:57 -; ALIGNED-NEXT: buffer_load_ubyte v52, v4, s[0:3], 0 offen offset:58 -; ALIGNED-NEXT: buffer_load_ubyte v38, v4, s[0:3], 0 offen offset:60 -; ALIGNED-NEXT: buffer_load_ubyte v50, v4, s[0:3], 0 offen offset:61 -; ALIGNED-NEXT: buffer_load_ubyte v39, v4, s[0:3], 0 offen offset:62 -; ALIGNED-NEXT: buffer_load_ubyte v49, v4, s[0:3], 0 offen offset:63 -; ALIGNED-NEXT: buffer_load_ubyte v29, v4, s[0:3], 0 offen offset:64 -; ALIGNED-NEXT: buffer_load_ubyte v55, v4, s[0:3], 0 offen offset:65 -; ALIGNED-NEXT: buffer_load_ubyte v66, v4, s[0:3], 0 offen offset:66 -; ALIGNED-NEXT: buffer_load_ubyte v53, v4, s[0:3], 0 offen offset:59 -; ALIGNED-NEXT: buffer_load_ubyte v67, v4, s[0:3], 0 offen offset:67 -; ALIGNED-NEXT: buffer_load_ubyte v54, v4, s[0:3], 0 offen offset:68 -; ALIGNED-NEXT: buffer_load_ubyte v64, v4, s[0:3], 0 offen offset:69 -; ALIGNED-NEXT: buffer_load_ubyte v65, v4, s[0:3], 0 offen offset:70 -; 
ALIGNED-NEXT: buffer_load_ubyte v68, v4, s[0:3], 0 offen offset:71 -; ALIGNED-NEXT: buffer_load_ubyte v69, v4, s[0:3], 0 offen offset:76 -; ALIGNED-NEXT: buffer_load_ubyte v70, v4, s[0:3], 0 offen offset:77 -; ALIGNED-NEXT: buffer_load_ubyte v81, v4, s[0:3], 0 offen offset:75 -; ALIGNED-NEXT: buffer_load_ubyte v71, v4, s[0:3], 0 offen offset:78 -; ALIGNED-NEXT: buffer_load_ubyte v80, v4, s[0:3], 0 offen offset:79 -; ALIGNED-NEXT: s_waitcnt vmcnt(57) +; ALIGNED-NEXT: s_clause 0x3a +; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:20 +; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:21 +; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:22 +; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:23 +; ALIGNED-NEXT: buffer_load_ubyte v5, v6, s[0:3], 0 offen offset:24 +; ALIGNED-NEXT: buffer_load_ubyte v10, v6, s[0:3], 0 offen offset:25 +; ALIGNED-NEXT: buffer_load_ubyte v12, v6, s[0:3], 0 offen offset:26 +; ALIGNED-NEXT: buffer_load_ubyte v127, v6, s[0:3], 0 offen offset:19 +; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:28 +; ALIGNED-NEXT: buffer_load_ubyte v7, v6, s[0:3], 0 offen offset:29 +; ALIGNED-NEXT: buffer_load_ubyte v8, v6, s[0:3], 0 offen offset:30 +; ALIGNED-NEXT: buffer_load_ubyte v9, v6, s[0:3], 0 offen offset:31 +; ALIGNED-NEXT: buffer_load_ubyte v14, v6, s[0:3], 0 offen offset:32 +; ALIGNED-NEXT: buffer_load_ubyte v15, v6, s[0:3], 0 offen offset:33 +; ALIGNED-NEXT: buffer_load_ubyte v19, v6, s[0:3], 0 offen offset:34 +; ALIGNED-NEXT: buffer_load_ubyte v11, v6, s[0:3], 0 offen offset:27 +; ALIGNED-NEXT: buffer_load_ubyte v20, v6, s[0:3], 0 offen offset:35 +; ALIGNED-NEXT: buffer_load_ubyte v13, v6, s[0:3], 0 offen offset:36 +; ALIGNED-NEXT: buffer_load_ubyte v16, v6, s[0:3], 0 offen offset:37 +; ALIGNED-NEXT: buffer_load_ubyte v17, v6, s[0:3], 0 offen offset:38 +; ALIGNED-NEXT: buffer_load_ubyte v18, v6, s[0:3], 0 offen offset:39 +; ALIGNED-NEXT: buffer_load_ubyte v22, v6, s[0:3], 
0 offen offset:40 +; ALIGNED-NEXT: buffer_load_ubyte v26, v6, s[0:3], 0 offen offset:41 +; ALIGNED-NEXT: buffer_load_ubyte v27, v6, s[0:3], 0 offen offset:42 +; ALIGNED-NEXT: buffer_load_ubyte v28, v6, s[0:3], 0 offen offset:43 +; ALIGNED-NEXT: buffer_load_ubyte v21, v6, s[0:3], 0 offen offset:44 +; ALIGNED-NEXT: buffer_load_ubyte v23, v6, s[0:3], 0 offen offset:45 +; ALIGNED-NEXT: buffer_load_ubyte v24, v6, s[0:3], 0 offen offset:46 +; ALIGNED-NEXT: buffer_load_ubyte v25, v6, s[0:3], 0 offen offset:47 +; ALIGNED-NEXT: buffer_load_ubyte v30, v6, s[0:3], 0 offen offset:48 +; ALIGNED-NEXT: buffer_load_ubyte v31, v6, s[0:3], 0 offen offset:49 +; ALIGNED-NEXT: buffer_load_ubyte v33, v6, s[0:3], 0 offen offset:50 +; ALIGNED-NEXT: buffer_load_ubyte v34, v6, s[0:3], 0 offen offset:51 +; ALIGNED-NEXT: buffer_load_ubyte v32, v6, s[0:3], 0 offen offset:52 +; ALIGNED-NEXT: buffer_load_ubyte v37, v6, s[0:3], 0 offen offset:53 +; ALIGNED-NEXT: buffer_load_ubyte v35, v6, s[0:3], 0 offen offset:54 +; ALIGNED-NEXT: buffer_load_ubyte v36, v6, s[0:3], 0 offen offset:55 +; ALIGNED-NEXT: buffer_load_ubyte v38, v6, s[0:3], 0 offen offset:56 +; ALIGNED-NEXT: buffer_load_ubyte v51, v6, s[0:3], 0 offen offset:57 +; ALIGNED-NEXT: buffer_load_ubyte v52, v6, s[0:3], 0 offen offset:58 +; ALIGNED-NEXT: buffer_load_ubyte v39, v6, s[0:3], 0 offen offset:60 +; ALIGNED-NEXT: buffer_load_ubyte v50, v6, s[0:3], 0 offen offset:61 +; ALIGNED-NEXT: buffer_load_ubyte v48, v6, s[0:3], 0 offen offset:62 +; ALIGNED-NEXT: buffer_load_ubyte v49, v6, s[0:3], 0 offen offset:63 +; ALIGNED-NEXT: buffer_load_ubyte v29, v6, s[0:3], 0 offen offset:64 +; ALIGNED-NEXT: buffer_load_ubyte v55, v6, s[0:3], 0 offen offset:65 +; ALIGNED-NEXT: buffer_load_ubyte v66, v6, s[0:3], 0 offen offset:66 +; ALIGNED-NEXT: buffer_load_ubyte v53, v6, s[0:3], 0 offen offset:59 +; ALIGNED-NEXT: buffer_load_ubyte v67, v6, s[0:3], 0 offen offset:67 +; ALIGNED-NEXT: buffer_load_ubyte v54, v6, s[0:3], 0 offen offset:68 +; ALIGNED-NEXT: 
buffer_load_ubyte v64, v6, s[0:3], 0 offen offset:69 +; ALIGNED-NEXT: buffer_load_ubyte v65, v6, s[0:3], 0 offen offset:70 +; ALIGNED-NEXT: buffer_load_ubyte v68, v6, s[0:3], 0 offen offset:71 +; ALIGNED-NEXT: buffer_load_ubyte v69, v6, s[0:3], 0 offen offset:76 +; ALIGNED-NEXT: buffer_load_ubyte v70, v6, s[0:3], 0 offen offset:77 +; ALIGNED-NEXT: buffer_load_ubyte v81, v6, s[0:3], 0 offen offset:75 +; ALIGNED-NEXT: buffer_load_ubyte v71, v6, s[0:3], 0 offen offset:78 +; ALIGNED-NEXT: buffer_load_ubyte v80, v6, s[0:3], 0 offen offset:79 +; ALIGNED-NEXT: buffer_load_ubyte v124, v6, s[0:3], 0 offen offset:155 +; ALIGNED-NEXT: s_waitcnt vmcnt(58) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(56) +; ALIGNED-NEXT: s_waitcnt vmcnt(57) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(55) +; ALIGNED-NEXT: s_waitcnt vmcnt(56) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(54) +; ALIGNED-NEXT: s_waitcnt vmcnt(55) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(54) +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(53) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(52) ; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(51) +; ALIGNED-NEXT: s_waitcnt vmcnt(52) ; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: s_waitcnt vmcnt(50) +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(49) -; ALIGNED-NEXT: 
buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(48) ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(47) +; ALIGNED-NEXT: s_waitcnt vmcnt(48) ; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(46) +; ALIGNED-NEXT: s_waitcnt vmcnt(47) ; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(45) +; ALIGNED-NEXT: s_waitcnt vmcnt(46) ; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v7, 8, v5 -; ALIGNED-NEXT: s_waitcnt vmcnt(42) +; ALIGNED-NEXT: v_lshl_or_b32 v2, v7, 8, v4 +; ALIGNED-NEXT: s_waitcnt vmcnt(43) ; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v3, v9, 8, v8 -; ALIGNED-NEXT: s_waitcnt vmcnt(40) +; ALIGNED-NEXT: s_waitcnt vmcnt(41) ; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, v6 -; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v12 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v10, 8, v5 +; ALIGNED-NEXT: v_lshl_or_b32 v5, v11, 8, v12 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v15, 8, v14 -; ALIGNED-NEXT: v_lshl_or_b32 v8, v19, 8, v17 -; ALIGNED-NEXT: s_waitcnt vmcnt(39) +; ALIGNED-NEXT: v_lshl_or_b32 v8, v20, 8, v19 +; ALIGNED-NEXT: s_waitcnt vmcnt(40) ; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 8, v13 -; ALIGNED-NEXT: s_waitcnt vmcnt(37) -; ALIGNED-NEXT: v_lshl_or_b32 v10, v20, 8, v18 -; ALIGNED-NEXT: s_waitcnt vmcnt(35) -; ALIGNED-NEXT: v_lshl_or_b32 v11, v23, 8, v22 -; ALIGNED-NEXT: s_waitcnt vmcnt(33) -; ALIGNED-NEXT: v_lshl_or_b32 v12, v28, 8, v25 -; ALIGNED-NEXT: s_waitcnt vmcnt(31) -; ALIGNED-NEXT: v_lshl_or_b32 v13, v24, 8, v21 +; ALIGNED-NEXT: 
s_waitcnt vmcnt(38) +; ALIGNED-NEXT: v_lshl_or_b32 v10, v18, 8, v17 +; ALIGNED-NEXT: s_waitcnt vmcnt(36) +; ALIGNED-NEXT: v_lshl_or_b32 v11, v26, 8, v22 +; ALIGNED-NEXT: s_waitcnt vmcnt(34) +; ALIGNED-NEXT: v_lshl_or_b32 v12, v28, 8, v27 +; ALIGNED-NEXT: s_waitcnt vmcnt(32) +; ALIGNED-NEXT: v_lshl_or_b32 v13, v23, 8, v21 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(29) -; ALIGNED-NEXT: v_lshl_or_b32 v14, v27, 8, v26 +; ALIGNED-NEXT: s_waitcnt vmcnt(30) +; ALIGNED-NEXT: v_lshl_or_b32 v14, v25, 8, v24 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v6, 16, v5 +; ALIGNED-NEXT: v_lshl_or_b32 v2, v5, 16, v4 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v8, 16, v7 -; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 16, v9 -; ALIGNED-NEXT: v_lshl_or_b32 v6, v12, 16, v11 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v10, 16, v9 +; ALIGNED-NEXT: v_lshl_or_b32 v5, v12, 16, v11 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v14, 16, v13 ; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(27) +; ALIGNED-NEXT: s_waitcnt vmcnt(28) ; ALIGNED-NEXT: v_lshl_or_b32 v15, v31, 8, v30 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(25) +; ALIGNED-NEXT: s_waitcnt vmcnt(26) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v34, 8, v33 -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(23) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(24) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v37, 8, v32 -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(21) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(22) ; ALIGNED-NEXT: v_lshl_or_b32 v2, v36, 8, v35 ; ALIGNED-NEXT: 
buffer_store_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(16) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v50, 8, v38 -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(14) -; ALIGNED-NEXT: v_lshl_or_b32 v5, v49, 8, v39 -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v6, v51, 8, v48 +; ALIGNED-NEXT: s_waitcnt vmcnt(17) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v50, 8, v39 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(15) +; ALIGNED-NEXT: v_lshl_or_b32 v4, v49, 8, v48 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v5, v51, 8, v38 ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(10) +; ALIGNED-NEXT: s_waitcnt vmcnt(11) ; ALIGNED-NEXT: v_lshl_or_b32 v7, v53, 8, v52 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v0, 16, v15 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v2, 16, v1 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v5, 16, v3 -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:85 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 16, v6 +; ALIGNED-NEXT: v_lshl_or_b32 v2, v4, 16, v3 +; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:85 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 16, v5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v55, 8, v29 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(10) +; ALIGNED-NEXT: s_waitcnt vmcnt(11) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v67, 8, v66 ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(8) +; ALIGNED-NEXT: s_waitcnt vmcnt(9) ; 
ALIGNED-NEXT: v_lshl_or_b32 v2, v64, 8, v54 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: s_waitcnt vmcnt(7) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v68, 8, v65 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:86 -; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:82 -; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v5, v6, s[0:3], 0 offen offset:86 +; ALIGNED-NEXT: buffer_load_ubyte v7, v6, s[0:3], 0 offen offset:82 +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v2 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: s_waitcnt vmcnt(7) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v70, 8, v69 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:83 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:74 +; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:83 +; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:74 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:988 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(5) +; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v80, 8, v71 -; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:784 ; 4-byte Folded 
Spill +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:73 +; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:73 ; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:992 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:72 +; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:72 ; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v33, 
off, s[0:3], s32 offset:860 ; 4-byte Folded Spill @@ -14445,11 +14627,11 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill @@ -14465,13 +14647,13 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill ; 
ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:87 +; ALIGNED-NEXT: buffer_load_ubyte v8, v6, s[0:3], 0 offen offset:87 ; ALIGNED-NEXT: s_waitcnt vmcnt(7) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1032 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1032 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(5) ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) @@ -14484,11 +14666,11 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:984 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v81, 8, v2 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:84 +; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:84 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:81 +; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:81 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1008 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:80 +; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:80 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) @@ -14499,48 +14681,48 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1012 ; 4-byte 
Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7 -; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:98 +; ALIGNED-NEXT: buffer_load_ubyte v7, v6, s[0:3], 0 offen offset:98 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v5 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:102 -; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:103 +; ALIGNED-NEXT: buffer_load_ubyte v5, v6, s[0:3], 0 offen offset:102 +; ALIGNED-NEXT: buffer_load_ubyte v8, v6, s[0:3], 0 offen offset:103 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1044 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 8, v2 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:94 -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:95 +; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:94 +; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:95 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:93 -; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:91 +; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:93 +; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:91 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1052 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:92 +; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:92 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1116 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1116 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(5) ; 
ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1120 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1060 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1064 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1064 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1056 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1048 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v2 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:90 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v2 +; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:90 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1080 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:101 +; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:101 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:89 +; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:89 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1072 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:88 +; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:88 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1084 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:1076 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) @@ -14548,13 +14730,13 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:99 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:100 +; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:99 +; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:100 ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1108 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:97 +; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:97 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1088 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:96 +; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:96 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1100 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) @@ -14565,48 +14747,48 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1092 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7 -; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:114 +; ALIGNED-NEXT: buffer_load_ubyte v7, v6, s[0:3], 0 offen offset:114 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v5 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:118 -; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen 
offset:119 +; ALIGNED-NEXT: buffer_load_ubyte v5, v6, s[0:3], 0 offen offset:118 +; ALIGNED-NEXT: buffer_load_ubyte v8, v6, s[0:3], 0 offen offset:119 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1124 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 8, v2 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:110 -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:111 +; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:110 +; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:111 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:109 -; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:107 +; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:109 +; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:107 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1132 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:108 +; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:108 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1196 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1196 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(5) ; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1200 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1140 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1144 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1144 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:1136 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v2 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:106 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v2 +; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:106 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1160 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:117 +; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:117 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:105 +; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:105 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1152 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:104 +; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:104 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1164 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1192 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1192 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1156 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) @@ -14614,13 +14796,13 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:115 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:116 +; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 
offen offset:115 +; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:116 ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1188 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:113 +; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:113 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:112 +; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:112 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1180 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) @@ -14631,48 +14813,48 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7 -; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:130 +; ALIGNED-NEXT: buffer_load_ubyte v7, v6, s[0:3], 0 offen offset:130 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v5 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:134 -; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:135 +; ALIGNED-NEXT: buffer_load_ubyte v5, v6, s[0:3], 0 offen offset:134 +; ALIGNED-NEXT: buffer_load_ubyte v8, v6, s[0:3], 0 offen offset:135 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 8, v2 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:126 -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:127 +; ALIGNED-NEXT: 
buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:126 +; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:127 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:125 -; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:123 +; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:125 +; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:123 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:124 +; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:124 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1276 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1276 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(5) ; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1280 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1220 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1224 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1224 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1216 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v2 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:122 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v2 +; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:122 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1240 ; 4-byte Folded 
Spill -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:133 +; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:133 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:121 +; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:121 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1232 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:120 +; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:120 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1244 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1272 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1272 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1236 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) @@ -14680,13 +14862,13 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:131 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:132 +; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:131 +; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:132 ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1268 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:129 +; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:129 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1248 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen 
offset:128 +; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:128 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1260 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) @@ -14697,48 +14879,48 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1252 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7 -; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:146 +; ALIGNED-NEXT: buffer_load_ubyte v7, v6, s[0:3], 0 offen offset:146 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v5 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:150 -; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:151 +; ALIGNED-NEXT: buffer_load_ubyte v5, v6, s[0:3], 0 offen offset:150 +; ALIGNED-NEXT: buffer_load_ubyte v8, v6, s[0:3], 0 offen offset:151 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1284 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 8, v2 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:142 -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:143 +; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:142 +; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:143 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:141 -; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:139 +; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:141 +; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:139 ; ALIGNED-NEXT: buffer_store_dword v0, off, 
s[0:3], s32 offset:1292 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:140 +; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:140 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1388 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1300 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1304 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1304 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1296 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1288 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v2 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:138 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v2 +; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:138 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1320 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:149 +; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:149 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:137 +; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:137 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Spill -; 
ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:136 +; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:136 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1324 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1372 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1360 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1316 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) @@ -14746,107 +14928,106 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:147 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:148 -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1360 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:147 +; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:148 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1356 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:145 +; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:145 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1328 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:144 +; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:144 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1348 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1344 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; 
ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1356 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1352 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1340 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7 +; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:159 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v2 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:158 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 8, v2 +; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:158 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:157 +; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:157 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:156 +; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:156 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1424 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1420 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1416 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1416 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1412 ; 
4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v126, v4, s[0:3], 0 offen offset:159 -; ALIGNED-NEXT: buffer_load_ubyte v124, v4, s[0:3], 0 offen offset:155 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v126, 8, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v123, v4, s[0:3], 0 offen offset:152 -; ALIGNED-NEXT: buffer_load_ubyte v121, v4, s[0:3], 0 offen offset:153 -; ALIGNED-NEXT: buffer_load_ubyte v111, v4, s[0:3], 0 offen offset:154 +; ALIGNED-NEXT: buffer_load_ubyte v123, v6, s[0:3], 0 offen offset:152 +; ALIGNED-NEXT: buffer_load_ubyte v122, v6, s[0:3], 0 offen offset:153 +; ALIGNED-NEXT: buffer_load_ubyte v111, v6, s[0:3], 0 offen offset:154 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v121, 8, v123 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v122, 8, v123 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v111 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v108, v4, s[0:3], 0 offen offset:160 -; ALIGNED-NEXT: buffer_load_ubyte v105, v4, s[0:3], 0 offen offset:161 -; ALIGNED-NEXT: buffer_load_ubyte v93, v4, s[0:3], 0 offen offset:162 -; ALIGNED-NEXT: buffer_load_ubyte v92, v4, s[0:3], 0 offen offset:163 -; ALIGNED-NEXT: buffer_load_ubyte v107, v4, s[0:3], 0 offen offset:164 -; ALIGNED-NEXT: buffer_load_ubyte v95, v4, s[0:3], 0 offen offset:165 -; 
ALIGNED-NEXT: buffer_load_ubyte v94, v4, s[0:3], 0 offen offset:166 -; ALIGNED-NEXT: buffer_load_ubyte v91, v4, s[0:3], 0 offen offset:167 +; ALIGNED-NEXT: buffer_load_ubyte v108, v6, s[0:3], 0 offen offset:160 +; ALIGNED-NEXT: buffer_load_ubyte v105, v6, s[0:3], 0 offen offset:161 +; ALIGNED-NEXT: buffer_load_ubyte v94, v6, s[0:3], 0 offen offset:162 +; ALIGNED-NEXT: buffer_load_ubyte v92, v6, s[0:3], 0 offen offset:163 +; ALIGNED-NEXT: buffer_load_ubyte v106, v6, s[0:3], 0 offen offset:164 +; ALIGNED-NEXT: buffer_load_ubyte v95, v6, s[0:3], 0 offen offset:165 +; ALIGNED-NEXT: buffer_load_ubyte v104, v6, s[0:3], 0 offen offset:166 +; ALIGNED-NEXT: buffer_load_ubyte v91, v6, s[0:3], 0 offen offset:167 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v105, 8, v108 ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v92, 8, v93 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v92, 8, v94 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v91, 8, v94 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v91, 8, v104 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v95, 8, v107 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v95, 8, v106 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v89, v4, s[0:3], 0 offen offset:172 -; ALIGNED-NEXT: buffer_load_ubyte v79, v4, s[0:3], 0 offen offset:173 -; ALIGNED-NEXT: buffer_load_ubyte v78, v4, s[0:3], 0 offen offset:174 -; ALIGNED-NEXT: buffer_load_ubyte v77, v4, s[0:3], 0 offen offset:175 -; ALIGNED-NEXT: buffer_load_ubyte v75, v4, s[0:3], 0 offen offset:171 +; ALIGNED-NEXT: buffer_load_ubyte v78, v6, s[0:3], 0 offen offset:172 +; ALIGNED-NEXT: buffer_load_ubyte v77, v6, s[0:3], 0 offen offset:173 +; ALIGNED-NEXT: buffer_load_ubyte v76, v6, s[0:3], 0 
offen offset:174 +; ALIGNED-NEXT: buffer_load_ubyte v75, v6, s[0:3], 0 offen offset:175 +; ALIGNED-NEXT: buffer_load_ubyte v74, v6, s[0:3], 0 offen offset:171 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v79, 8, v89 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 8, v78 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v77, 8, v78 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v75, 8, v76 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v74, v4, s[0:3], 0 offen offset:168 -; ALIGNED-NEXT: buffer_load_ubyte v72, v4, s[0:3], 0 offen offset:169 -; ALIGNED-NEXT: buffer_load_ubyte v63, v4, s[0:3], 0 offen offset:170 +; ALIGNED-NEXT: buffer_load_ubyte v73, v6, s[0:3], 0 offen offset:168 +; ALIGNED-NEXT: buffer_load_ubyte v63, v6, s[0:3], 0 offen offset:169 +; ALIGNED-NEXT: buffer_load_ubyte v62, v6, s[0:3], 0 offen offset:170 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v74 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v63, 8, v73 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v75, 8, v63 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v74, 8, v62 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v61, v4, s[0:3], 0 offen offset:176 -; ALIGNED-NEXT: buffer_load_ubyte v59, v4, s[0:3], 0 offen offset:177 -; ALIGNED-NEXT: buffer_load_ubyte v47, v4, s[0:3], 0 offen offset:178 -; ALIGNED-NEXT: buffer_load_ubyte v56, v4, s[0:3], 0 offen offset:179 -; ALIGNED-NEXT: buffer_load_ubyte v60, v4, s[0:3], 0 offen offset:180 -; ALIGNED-NEXT: buffer_load_ubyte v57, v4, s[0:3], 0 offen offset:181 -; ALIGNED-NEXT: buffer_load_ubyte v58, v4, s[0:3], 0 offen offset:182 -; ALIGNED-NEXT: buffer_load_ubyte v46, v4, s[0:3], 0 offen offset:183 +; 
ALIGNED-NEXT: buffer_load_ubyte v61, v6, s[0:3], 0 offen offset:176 +; ALIGNED-NEXT: buffer_load_ubyte v59, v6, s[0:3], 0 offen offset:177 +; ALIGNED-NEXT: buffer_load_ubyte v47, v6, s[0:3], 0 offen offset:178 +; ALIGNED-NEXT: buffer_load_ubyte v56, v6, s[0:3], 0 offen offset:179 +; ALIGNED-NEXT: buffer_load_ubyte v60, v6, s[0:3], 0 offen offset:180 +; ALIGNED-NEXT: buffer_load_ubyte v57, v6, s[0:3], 0 offen offset:181 +; ALIGNED-NEXT: buffer_load_ubyte v58, v6, s[0:3], 0 offen offset:182 +; ALIGNED-NEXT: buffer_load_ubyte v46, v6, s[0:3], 0 offen offset:183 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v59, 8, v61 ; ALIGNED-NEXT: s_waitcnt vmcnt(4) @@ -14859,11 +15040,11 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v44, v4, s[0:3], 0 offen offset:188 -; ALIGNED-NEXT: buffer_load_ubyte v43, v4, s[0:3], 0 offen offset:189 -; ALIGNED-NEXT: buffer_load_ubyte v42, v4, s[0:3], 0 offen offset:190 -; ALIGNED-NEXT: buffer_load_ubyte v41, v4, s[0:3], 0 offen offset:191 -; ALIGNED-NEXT: buffer_load_ubyte v40, v4, s[0:3], 0 offen offset:187 +; ALIGNED-NEXT: buffer_load_ubyte v44, v6, s[0:3], 0 offen offset:188 +; ALIGNED-NEXT: buffer_load_ubyte v43, v6, s[0:3], 0 offen offset:189 +; ALIGNED-NEXT: buffer_load_ubyte v42, v6, s[0:3], 0 offen offset:190 +; ALIGNED-NEXT: buffer_load_ubyte v41, v6, s[0:3], 0 offen offset:191 +; ALIGNED-NEXT: buffer_load_ubyte v40, v6, s[0:3], 0 offen offset:187 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v44 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) @@ -14871,289 +15052,291 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1480 
; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v119, v4, s[0:3], 0 offen offset:184 -; ALIGNED-NEXT: buffer_load_ubyte v118, v4, s[0:3], 0 offen offset:185 -; ALIGNED-NEXT: buffer_load_ubyte v117, v4, s[0:3], 0 offen offset:186 +; ALIGNED-NEXT: buffer_load_ubyte v119, v6, s[0:3], 0 offen offset:184 +; ALIGNED-NEXT: buffer_load_ubyte v118, v6, s[0:3], 0 offen offset:185 +; ALIGNED-NEXT: buffer_load_ubyte v116, v6, s[0:3], 0 offen offset:186 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v118, 8, v119 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v117 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v116 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x3e -; ALIGNED-NEXT: buffer_load_ubyte v115, v4, s[0:3], 0 offen offset:192 -; ALIGNED-NEXT: buffer_load_ubyte v112, v4, s[0:3], 0 offen offset:193 -; ALIGNED-NEXT: buffer_load_ubyte v101, v4, s[0:3], 0 offen offset:194 -; ALIGNED-NEXT: buffer_load_ubyte v100, v4, s[0:3], 0 offen offset:195 -; ALIGNED-NEXT: buffer_load_ubyte v113, v4, s[0:3], 0 offen offset:196 -; ALIGNED-NEXT: buffer_load_ubyte v103, v4, s[0:3], 0 offen offset:197 -; ALIGNED-NEXT: buffer_load_ubyte v102, v4, s[0:3], 0 offen offset:198 -; ALIGNED-NEXT: buffer_load_ubyte v99, v4, s[0:3], 0 offen offset:199 -; ALIGNED-NEXT: buffer_load_ubyte v97, v4, s[0:3], 0 offen offset:204 -; ALIGNED-NEXT: buffer_load_ubyte v87, v4, s[0:3], 0 offen offset:205 -; ALIGNED-NEXT: buffer_load_ubyte v96, v4, s[0:3], 0 offen offset:206 -; ALIGNED-NEXT: buffer_load_ubyte v86, v4, s[0:3], 0 offen offset:207 -; ALIGNED-NEXT: buffer_load_ubyte v85, v4, s[0:3], 0 offen offset:203 -; ALIGNED-NEXT: buffer_load_ubyte v84, v4, s[0:3], 0 offen offset:200 -; ALIGNED-NEXT: buffer_load_ubyte v83, v4, s[0:3], 0 offen offset:201 -; ALIGNED-NEXT: buffer_load_ubyte v82, v4, s[0:3], 0 offen 
offset:202 -; ALIGNED-NEXT: buffer_load_ubyte v80, v4, s[0:3], 0 offen offset:212 -; ALIGNED-NEXT: buffer_load_ubyte v68, v4, s[0:3], 0 offen offset:213 -; ALIGNED-NEXT: buffer_load_ubyte v70, v4, s[0:3], 0 offen offset:214 -; ALIGNED-NEXT: buffer_load_ubyte v65, v4, s[0:3], 0 offen offset:215 -; ALIGNED-NEXT: buffer_load_ubyte v66, v4, s[0:3], 0 offen offset:211 -; ALIGNED-NEXT: buffer_load_ubyte v71, v4, s[0:3], 0 offen offset:216 -; ALIGNED-NEXT: buffer_load_ubyte v67, v4, s[0:3], 0 offen offset:217 -; ALIGNED-NEXT: buffer_load_ubyte v53, v4, s[0:3], 0 offen offset:218 -; ALIGNED-NEXT: buffer_load_ubyte v52, v4, s[0:3], 0 offen offset:219 -; ALIGNED-NEXT: buffer_load_ubyte v69, v4, s[0:3], 0 offen offset:220 -; ALIGNED-NEXT: buffer_load_ubyte v55, v4, s[0:3], 0 offen offset:221 -; ALIGNED-NEXT: buffer_load_ubyte v54, v4, s[0:3], 0 offen offset:222 -; ALIGNED-NEXT: buffer_load_ubyte v51, v4, s[0:3], 0 offen offset:223 -; ALIGNED-NEXT: buffer_load_ubyte v50, v4, s[0:3], 0 offen offset:208 -; ALIGNED-NEXT: buffer_load_ubyte v38, v4, s[0:3], 0 offen offset:209 -; ALIGNED-NEXT: buffer_load_ubyte v39, v4, s[0:3], 0 offen offset:210 -; ALIGNED-NEXT: buffer_load_ubyte v37, v4, s[0:3], 0 offen offset:224 -; ALIGNED-NEXT: buffer_load_ubyte v35, v4, s[0:3], 0 offen offset:225 -; ALIGNED-NEXT: buffer_load_ubyte v31, v4, s[0:3], 0 offen offset:226 -; ALIGNED-NEXT: buffer_load_ubyte v32, v4, s[0:3], 0 offen offset:227 -; ALIGNED-NEXT: buffer_load_ubyte v36, v4, s[0:3], 0 offen offset:228 -; ALIGNED-NEXT: buffer_load_ubyte v33, v4, s[0:3], 0 offen offset:229 -; ALIGNED-NEXT: buffer_load_ubyte v34, v4, s[0:3], 0 offen offset:230 -; ALIGNED-NEXT: buffer_load_ubyte v30, v4, s[0:3], 0 offen offset:231 -; ALIGNED-NEXT: buffer_load_ubyte v29, v4, s[0:3], 0 offen offset:236 -; ALIGNED-NEXT: buffer_load_ubyte v27, v4, s[0:3], 0 offen offset:237 -; ALIGNED-NEXT: buffer_load_ubyte v28, v4, s[0:3], 0 offen offset:238 -; ALIGNED-NEXT: buffer_load_ubyte v26, v4, s[0:3], 0 offen offset:239 
-; ALIGNED-NEXT: buffer_load_ubyte v23, v4, s[0:3], 0 offen offset:235 -; ALIGNED-NEXT: buffer_load_ubyte v24, v4, s[0:3], 0 offen offset:232 -; ALIGNED-NEXT: buffer_load_ubyte v22, v4, s[0:3], 0 offen offset:233 -; ALIGNED-NEXT: buffer_load_ubyte v21, v4, s[0:3], 0 offen offset:234 -; ALIGNED-NEXT: buffer_load_ubyte v19, v4, s[0:3], 0 offen offset:240 -; ALIGNED-NEXT: buffer_load_ubyte v17, v4, s[0:3], 0 offen offset:241 -; ALIGNED-NEXT: buffer_load_ubyte v13, v4, s[0:3], 0 offen offset:242 -; ALIGNED-NEXT: buffer_load_ubyte v14, v4, s[0:3], 0 offen offset:243 -; ALIGNED-NEXT: buffer_load_ubyte v18, v4, s[0:3], 0 offen offset:244 -; ALIGNED-NEXT: buffer_load_ubyte v15, v4, s[0:3], 0 offen offset:245 -; ALIGNED-NEXT: buffer_load_ubyte v16, v4, s[0:3], 0 offen offset:246 -; ALIGNED-NEXT: buffer_load_ubyte v12, v4, s[0:3], 0 offen offset:247 -; ALIGNED-NEXT: buffer_load_ubyte v11, v4, s[0:3], 0 offen offset:252 -; ALIGNED-NEXT: buffer_load_ubyte v9, v4, s[0:3], 0 offen offset:253 -; ALIGNED-NEXT: buffer_load_ubyte v10, v4, s[0:3], 0 offen offset:254 -; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:255 -; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:251 -; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:248 -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: buffer_load_ubyte v115, v6, s[0:3], 0 offen offset:192 +; ALIGNED-NEXT: buffer_load_ubyte v112, v6, s[0:3], 0 offen offset:193 +; ALIGNED-NEXT: buffer_load_ubyte v101, v6, s[0:3], 0 offen offset:194 +; ALIGNED-NEXT: buffer_load_ubyte v100, v6, s[0:3], 0 offen offset:195 +; ALIGNED-NEXT: buffer_load_ubyte v113, v6, s[0:3], 0 offen offset:196 +; ALIGNED-NEXT: buffer_load_ubyte v103, v6, s[0:3], 0 offen offset:197 +; ALIGNED-NEXT: buffer_load_ubyte v102, v6, s[0:3], 0 offen offset:198 +; ALIGNED-NEXT: buffer_load_ubyte v99, v6, s[0:3], 0 offen offset:199 +; ALIGNED-NEXT: buffer_load_ubyte v97, v6, s[0:3], 0 offen offset:204 +; 
ALIGNED-NEXT: buffer_load_ubyte v87, v6, s[0:3], 0 offen offset:205 +; ALIGNED-NEXT: buffer_load_ubyte v96, v6, s[0:3], 0 offen offset:206 +; ALIGNED-NEXT: buffer_load_ubyte v86, v6, s[0:3], 0 offen offset:207 +; ALIGNED-NEXT: buffer_load_ubyte v85, v6, s[0:3], 0 offen offset:203 +; ALIGNED-NEXT: buffer_load_ubyte v84, v6, s[0:3], 0 offen offset:200 +; ALIGNED-NEXT: buffer_load_ubyte v83, v6, s[0:3], 0 offen offset:201 +; ALIGNED-NEXT: buffer_load_ubyte v82, v6, s[0:3], 0 offen offset:202 +; ALIGNED-NEXT: buffer_load_ubyte v80, v6, s[0:3], 0 offen offset:212 +; ALIGNED-NEXT: buffer_load_ubyte v67, v6, s[0:3], 0 offen offset:213 +; ALIGNED-NEXT: buffer_load_ubyte v70, v6, s[0:3], 0 offen offset:214 +; ALIGNED-NEXT: buffer_load_ubyte v64, v6, s[0:3], 0 offen offset:215 +; ALIGNED-NEXT: buffer_load_ubyte v65, v6, s[0:3], 0 offen offset:211 +; ALIGNED-NEXT: buffer_load_ubyte v71, v6, s[0:3], 0 offen offset:216 +; ALIGNED-NEXT: buffer_load_ubyte v66, v6, s[0:3], 0 offen offset:217 +; ALIGNED-NEXT: buffer_load_ubyte v53, v6, s[0:3], 0 offen offset:218 +; ALIGNED-NEXT: buffer_load_ubyte v52, v6, s[0:3], 0 offen offset:219 +; ALIGNED-NEXT: buffer_load_ubyte v69, v6, s[0:3], 0 offen offset:220 +; ALIGNED-NEXT: buffer_load_ubyte v55, v6, s[0:3], 0 offen offset:221 +; ALIGNED-NEXT: buffer_load_ubyte v54, v6, s[0:3], 0 offen offset:222 +; ALIGNED-NEXT: buffer_load_ubyte v51, v6, s[0:3], 0 offen offset:223 +; ALIGNED-NEXT: buffer_load_ubyte v49, v6, s[0:3], 0 offen offset:208 +; ALIGNED-NEXT: buffer_load_ubyte v38, v6, s[0:3], 0 offen offset:209 +; ALIGNED-NEXT: buffer_load_ubyte v39, v6, s[0:3], 0 offen offset:210 +; ALIGNED-NEXT: buffer_load_ubyte v37, v6, s[0:3], 0 offen offset:224 +; ALIGNED-NEXT: buffer_load_ubyte v35, v6, s[0:3], 0 offen offset:225 +; ALIGNED-NEXT: buffer_load_ubyte v31, v6, s[0:3], 0 offen offset:226 +; ALIGNED-NEXT: buffer_load_ubyte v32, v6, s[0:3], 0 offen offset:227 +; ALIGNED-NEXT: buffer_load_ubyte v36, v6, s[0:3], 0 offen offset:228 +; 
ALIGNED-NEXT: buffer_load_ubyte v33, v6, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: buffer_load_ubyte v34, v6, s[0:3], 0 offen offset:230 +; ALIGNED-NEXT: buffer_load_ubyte v30, v6, s[0:3], 0 offen offset:231 +; ALIGNED-NEXT: buffer_load_ubyte v29, v6, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: buffer_load_ubyte v27, v6, s[0:3], 0 offen offset:237 +; ALIGNED-NEXT: buffer_load_ubyte v28, v6, s[0:3], 0 offen offset:238 +; ALIGNED-NEXT: buffer_load_ubyte v26, v6, s[0:3], 0 offen offset:239 +; ALIGNED-NEXT: buffer_load_ubyte v25, v6, s[0:3], 0 offen offset:235 +; ALIGNED-NEXT: buffer_load_ubyte v24, v6, s[0:3], 0 offen offset:232 +; ALIGNED-NEXT: buffer_load_ubyte v23, v6, s[0:3], 0 offen offset:233 +; ALIGNED-NEXT: buffer_load_ubyte v22, v6, s[0:3], 0 offen offset:234 +; ALIGNED-NEXT: buffer_load_ubyte v21, v6, s[0:3], 0 offen offset:240 +; ALIGNED-NEXT: buffer_load_ubyte v19, v6, s[0:3], 0 offen offset:241 +; ALIGNED-NEXT: buffer_load_ubyte v15, v6, s[0:3], 0 offen offset:242 +; ALIGNED-NEXT: buffer_load_ubyte v16, v6, s[0:3], 0 offen offset:243 +; ALIGNED-NEXT: buffer_load_ubyte v20, v6, s[0:3], 0 offen offset:244 +; ALIGNED-NEXT: buffer_load_ubyte v17, v6, s[0:3], 0 offen offset:245 +; ALIGNED-NEXT: buffer_load_ubyte v18, v6, s[0:3], 0 offen offset:246 +; ALIGNED-NEXT: buffer_load_ubyte v14, v6, s[0:3], 0 offen offset:247 +; ALIGNED-NEXT: buffer_load_ubyte v13, v6, s[0:3], 0 offen offset:252 +; ALIGNED-NEXT: buffer_load_ubyte v11, v6, s[0:3], 0 offen offset:253 +; ALIGNED-NEXT: buffer_load_ubyte v12, v6, s[0:3], 0 offen offset:254 +; ALIGNED-NEXT: buffer_load_ubyte v10, v6, s[0:3], 0 offen offset:255 +; ALIGNED-NEXT: buffer_load_ubyte v9, v6, s[0:3], 0 offen offset:251 +; ALIGNED-NEXT: buffer_load_ubyte v8, v6, s[0:3], 0 offen offset:248 +; ALIGNED-NEXT: buffer_load_ubyte v7, v6, s[0:3], 0 offen offset:249 ; ALIGNED-NEXT: s_clause 0x6 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:250 -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen 
-; ALIGNED-NEXT: buffer_load_ubyte v120, v4, s[0:3], 0 offen offset:2 -; ALIGNED-NEXT: buffer_load_ubyte v104, v4, s[0:3], 0 offen offset:4 -; ALIGNED-NEXT: buffer_load_ubyte v109, v4, s[0:3], 0 offen offset:5 -; ALIGNED-NEXT: buffer_load_ubyte v110, v4, s[0:3], 0 offen offset:6 -; ALIGNED-NEXT: buffer_load_ubyte v122, v4, s[0:3], 0 offen offset:7 +; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:250 +; ALIGNED-NEXT: buffer_load_ubyte v120, v6, s[0:3], 0 offen offset:2 +; ALIGNED-NEXT: buffer_load_ubyte v89, v6, s[0:3], 0 offen offset:3 +; ALIGNED-NEXT: buffer_load_ubyte v93, v6, s[0:3], 0 offen offset:4 +; ALIGNED-NEXT: buffer_load_ubyte v109, v6, s[0:3], 0 offen offset:5 +; ALIGNED-NEXT: buffer_load_ubyte v110, v6, s[0:3], 0 offen offset:6 +; ALIGNED-NEXT: buffer_load_ubyte v121, v6, s[0:3], 0 offen offset:7 ; ALIGNED-NEXT: s_waitcnt vmcnt(62) -; ALIGNED-NEXT: v_lshl_or_b32 v2, v112, 8, v115 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v100, 8, v101 -; ALIGNED-NEXT: v_lshl_or_b32 v106, v3, 16, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v103, 8, v113 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v99, 8, v102 -; ALIGNED-NEXT: v_lshl_or_b32 v90, v3, 16, v2 -; ALIGNED-NEXT: s_waitcnt vmcnt(60) -; ALIGNED-NEXT: v_lshl_or_b32 v2, v87, 8, v97 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v112, 8, v115 +; ALIGNED-NEXT: v_lshl_or_b32 v2, v100, 8, v101 ; ALIGNED-NEXT: s_waitcnt vmcnt(58) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v86, 8, v96 +; ALIGNED-NEXT: v_lshl_or_b32 v107, v2, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v103, 8, v113 +; ALIGNED-NEXT: v_lshl_or_b32 v2, v99, 8, v102 +; ALIGNED-NEXT: v_lshl_or_b32 v90, v2, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v2, v87, 8, v97 ; ALIGNED-NEXT: s_waitcnt vmcnt(14) -; ALIGNED-NEXT: v_lshl_or_b32 v62, v12, 8, v16 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v14, 8, v18 ; ALIGNED-NEXT: s_waitcnt vmcnt(10) -; ALIGNED-NEXT: v_lshl_or_b32 v76, v8, 8, v10 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:1364 ; 4-byte 
Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v88, v3, 16, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, v12 +; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: v_lshl_or_b32 v88, v9, 8, v1 +; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen +; ALIGNED-NEXT: v_lshl_or_b32 v79, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v83, 8, v84 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v85, 8, v82 +; ALIGNED-NEXT: s_waitcnt vmcnt(5) +; ALIGNED-NEXT: buffer_store_dword v89, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v89, v89, 8, v120 +; ALIGNED-NEXT: buffer_store_dword v120, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: buffer_store_dword v93, off, s[0:3], s32 offset:1372 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v72, v3, 16, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v2, v67, 8, v80 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v64, 8, v70 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v109, off, s[0:3], s32 offset:1384 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v109, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v110, off, s[0:3], s32 offset:1388 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v110, off, s[0:3], s32 offset:1384 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v122, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v73, v3, 16, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v68, 8, v80 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v65, 8, v70 +; ALIGNED-NEXT: buffer_store_dword v121, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v45, v3, 16, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v67, 8, v71 +; ALIGNED-NEXT: v_lshl_or_b32 v2, v66, 8, v71 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v52, 8, v53 -; ALIGNED-NEXT: v_lshl_or_b32 v116, v3, 16, 
v2 +; ALIGNED-NEXT: v_lshl_or_b32 v117, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v55, 8, v69 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v51, 8, v54 ; ALIGNED-NEXT: v_lshl_or_b32 v114, v3, 16, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v38, 8, v50 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v66, 8, v39 +; ALIGNED-NEXT: v_lshl_or_b32 v2, v38, 8, v49 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v65, 8, v39 ; ALIGNED-NEXT: v_lshl_or_b32 v98, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v35, 8, v37 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v32, 8, v31 ; ALIGNED-NEXT: v_lshl_or_b32 v81, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v33, 8, v36 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v30, 8, v34 -; ALIGNED-NEXT: v_lshl_or_b32 v64, v3, 16, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v68, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v27, 8, v29 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v26, 8, v28 -; ALIGNED-NEXT: v_lshl_or_b32 v49, v3, 16, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v22, 8, v24 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v23, 8, v21 +; ALIGNED-NEXT: v_lshl_or_b32 v50, v3, 16, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v2, v23, 8, v24 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v25, 8, v22 ; ALIGNED-NEXT: v_lshl_or_b32 v48, v3, 16, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v17, 8, v19 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v14, 8, v13 -; ALIGNED-NEXT: v_lshl_or_b32 v25, v3, 16, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v15, 8, v18 -; ALIGNED-NEXT: v_lshl_or_b32 v20, v62, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v62, v9, 8, v11 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v76, 16, v62 -; ALIGNED-NEXT: v_lshl_or_b32 v62, v5, 8, v6 -; ALIGNED-NEXT: v_lshl_or_b32 v76, v7, 8, v1 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v76, 16, v62 -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v62, v4, s[0:3], 0 offen offset:1 -; ALIGNED-NEXT: buffer_load_ubyte v76, v4, s[0:3], 0 offen offset:3 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1336 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v120, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill +; 
ALIGNED-NEXT: v_lshl_or_b32 v2, v19, 8, v21 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v16, 8, v15 +; ALIGNED-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v17, 8, v20 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v11, 8, v13 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v5, 16, v4 +; ALIGNED-NEXT: v_lshl_or_b32 v5, v7, 8, v8 +; ALIGNED-NEXT: v_lshl_or_b32 v5, v88, 16, v5 +; ALIGNED-NEXT: buffer_load_ubyte v88, v6, s[0:3], 0 offen offset:1 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:1344 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1336 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v76, off, s[0:3], s32 offset:1352 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v62, v62, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v76, v76, 8, v120 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62 -; ALIGNED-NEXT: v_lshl_or_b32 v62, v109, 8, v104 -; ALIGNED-NEXT: v_lshl_or_b32 v76, v122, 8, v110 +; ALIGNED-NEXT: buffer_store_dword v88, off, s[0:3], s32 offset:1348 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v88, v88, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v89, 16, v88 +; ALIGNED-NEXT: v_lshl_or_b32 v88, v109, 8, v93 +; ALIGNED-NEXT: v_lshl_or_b32 v89, v121, 8, v110 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:12 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v89, 16, v88 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1420 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:12 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Spill -; 
ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_ubyte v127, v4, s[0:3], 0 offen offset:13 -; ALIGNED-NEXT: buffer_load_ubyte v76, v4, s[0:3], 0 offen offset:14 -; ALIGNED-NEXT: buffer_load_ubyte v104, v4, s[0:3], 0 offen offset:15 -; ALIGNED-NEXT: buffer_load_ubyte v120, v4, s[0:3], 0 offen offset:11 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v62, v127, 8, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v76, off, s[0:3], s32 offset:1428 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1428 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v125, v6, s[0:3], 0 offen offset:13 +; ALIGNED-NEXT: buffer_load_ubyte v89, v6, s[0:3], 0 offen offset:14 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v76, v104, 8, v76 -; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:1432 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v88, v125, 8, v0 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v89, off, s[0:3], s32 offset:1432 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v126, v6, s[0:3], 0 offen offset:15 +; ALIGNED-NEXT: buffer_load_ubyte v120, v6, s[0:3], 0 offen offset:11 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v89, v126, 8, v89 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v89, 16, v88 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v122, v4, s[0:3], 0 offen offset:8 -; ALIGNED-NEXT: buffer_load_ubyte v110, v4, s[0:3], 0 offen offset:9 -; ALIGNED-NEXT: buffer_load_ubyte v109, v4, s[0:3], 0 offen offset:10 +; ALIGNED-NEXT: buffer_load_ubyte v121, v6, s[0:3], 0 offen offset:8 +; ALIGNED-NEXT: 
buffer_load_ubyte v110, v6, s[0:3], 0 offen offset:9 +; ALIGNED-NEXT: buffer_load_ubyte v109, v6, s[0:3], 0 offen offset:10 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v62, v110, 8, v122 +; ALIGNED-NEXT: v_lshl_or_b32 v88, v110, 8, v121 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v76, v120, 8, v109 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62 +; ALIGNED-NEXT: v_lshl_or_b32 v89, v120, 8, v109 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v89, 16, v88 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v62, v4, s[0:3], 0 offen offset:18 -; ALIGNED-NEXT: buffer_load_ubyte v104, v4, s[0:3], 0 offen offset:16 -; ALIGNED-NEXT: buffer_load_ubyte v76, v4, s[0:3], 0 offen offset:17 -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:488 -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:492 -; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:484 -; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:480 +; ALIGNED-NEXT: buffer_load_ubyte v88, v6, s[0:3], 0 offen offset:18 +; ALIGNED-NEXT: buffer_load_ubyte v93, v6, s[0:3], 0 offen offset:16 +; ALIGNED-NEXT: buffer_load_ubyte v89, v6, s[0:3], 0 offen offset:17 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:488 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:492 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:484 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:480 ; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:704 ; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:708 -; ALIGNED-NEXT: v_add_nc_u32_e32 v4, 0xffffff00, v4 +; ALIGNED-NEXT: v_add_nc_u32_e32 v6, 0xffffff00, v6 ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v125, 8, v62 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v127, 8, v88 ; ALIGNED-NEXT: s_waitcnt 
vmcnt(2) -; ALIGNED-NEXT: v_lshl_or_b32 v125, v76, 8, v104 +; ALIGNED-NEXT: v_lshl_or_b32 v127, v89, 8, v93 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: v_add_co_u32 v2, vcc_lo, v2, s4 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: v_add_co_ci_u32_e64 v3, null, s5, v3, vcc_lo -; ALIGNED-NEXT: flat_store_byte v[2:3], v1 offset:250 -; ALIGNED-NEXT: flat_store_byte v[2:3], v7 offset:251 -; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:249 -; ALIGNED-NEXT: flat_store_byte v[2:3], v8 offset:255 -; ALIGNED-NEXT: flat_store_byte v[2:3], v9 offset:253 -; ALIGNED-NEXT: flat_store_byte v[2:3], v10 offset:254 -; ALIGNED-NEXT: flat_store_byte v[2:3], v11 offset:252 -; ALIGNED-NEXT: flat_store_byte v[2:3], v6 offset:248 -; ALIGNED-NEXT: flat_store_byte v[2:3], v13 offset:242 -; ALIGNED-NEXT: flat_store_byte v[2:3], v14 offset:243 -; ALIGNED-NEXT: flat_store_byte v[2:3], v17 offset:241 -; ALIGNED-NEXT: flat_store_byte v[2:3], v12 offset:247 -; ALIGNED-NEXT: flat_store_byte v[2:3], v15 offset:245 -; ALIGNED-NEXT: flat_store_byte v[2:3], v16 offset:246 -; ALIGNED-NEXT: flat_store_byte v[2:3], v18 offset:244 -; ALIGNED-NEXT: flat_store_byte v[2:3], v19 offset:240 +; ALIGNED-NEXT: v_lshl_or_b32 v127, v0, 16, v127 +; ALIGNED-NEXT: v_add_co_u32 v4, vcc_lo, v2, 3 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v3, vcc_lo +; ALIGNED-NEXT: flat_store_byte v[4:5], v1 offset:247 +; ALIGNED-NEXT: flat_store_byte v[4:5], v9 offset:248 +; ALIGNED-NEXT: flat_store_byte v[4:5], v7 offset:246 +; ALIGNED-NEXT: flat_store_byte v[4:5], v10 offset:252 +; ALIGNED-NEXT: flat_store_byte v[4:5], v11 offset:250 +; ALIGNED-NEXT: flat_store_byte v[4:5], v12 offset:251 +; ALIGNED-NEXT: flat_store_byte v[4:5], v13 offset:249 +; ALIGNED-NEXT: flat_store_byte v[4:5], v8 offset:245 +; ALIGNED-NEXT: flat_store_byte v[4:5], v15 offset:239 +; ALIGNED-NEXT: flat_store_byte v[4:5], v16 offset:240 +; ALIGNED-NEXT: flat_store_byte v[4:5], v19 offset:238 +; ALIGNED-NEXT: flat_store_byte v[4:5], v14 
offset:244 +; ALIGNED-NEXT: flat_store_byte v[4:5], v17 offset:242 +; ALIGNED-NEXT: flat_store_byte v[4:5], v18 offset:243 +; ALIGNED-NEXT: flat_store_byte v[4:5], v20 offset:241 +; ALIGNED-NEXT: flat_store_byte v[4:5], v21 offset:237 ; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:504 -; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:508 -; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:500 +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:508 +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:500 ; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:496 -; ALIGNED-NEXT: flat_store_byte v[2:3], v21 offset:234 -; ALIGNED-NEXT: flat_store_byte v[2:3], v23 offset:235 -; ALIGNED-NEXT: flat_store_byte v[2:3], v22 offset:233 -; ALIGNED-NEXT: flat_store_byte v[2:3], v26 offset:239 -; ALIGNED-NEXT: flat_store_byte v[2:3], v27 offset:237 -; ALIGNED-NEXT: flat_store_byte v[2:3], v28 offset:238 -; ALIGNED-NEXT: flat_store_byte v[2:3], v29 offset:236 -; ALIGNED-NEXT: flat_store_byte v[2:3], v24 offset:232 -; ALIGNED-NEXT: flat_store_byte v[2:3], v31 offset:226 -; ALIGNED-NEXT: flat_store_byte v[2:3], v32 offset:227 -; ALIGNED-NEXT: flat_store_byte v[2:3], v35 offset:225 -; ALIGNED-NEXT: flat_store_byte v[2:3], v30 offset:231 -; ALIGNED-NEXT: flat_store_byte v[2:3], v33 offset:229 -; ALIGNED-NEXT: flat_store_byte v[2:3], v34 offset:230 -; ALIGNED-NEXT: flat_store_byte v[2:3], v36 offset:228 -; ALIGNED-NEXT: flat_store_byte v[2:3], v37 offset:224 +; ALIGNED-NEXT: flat_store_byte v[4:5], v22 offset:231 +; ALIGNED-NEXT: flat_store_byte v[4:5], v25 offset:232 +; ALIGNED-NEXT: flat_store_byte v[4:5], v23 offset:230 +; ALIGNED-NEXT: flat_store_byte v[4:5], v26 offset:236 +; ALIGNED-NEXT: flat_store_byte v[4:5], v27 offset:234 +; ALIGNED-NEXT: flat_store_byte v[4:5], v28 offset:235 +; ALIGNED-NEXT: flat_store_byte v[4:5], v29 offset:233 +; ALIGNED-NEXT: flat_store_byte v[4:5], v24 offset:229 +; 
ALIGNED-NEXT: flat_store_byte v[4:5], v31 offset:223 +; ALIGNED-NEXT: flat_store_byte v[4:5], v32 offset:224 +; ALIGNED-NEXT: flat_store_byte v[4:5], v35 offset:222 +; ALIGNED-NEXT: flat_store_byte v[4:5], v30 offset:228 +; ALIGNED-NEXT: flat_store_byte v[4:5], v33 offset:226 +; ALIGNED-NEXT: flat_store_byte v[4:5], v34 offset:227 +; ALIGNED-NEXT: flat_store_byte v[4:5], v36 offset:225 +; ALIGNED-NEXT: flat_store_byte v[4:5], v37 offset:221 ; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:448 ; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:460 -; ALIGNED-NEXT: buffer_store_dword v116, off, s[0:3], s32 offset:456 +; ALIGNED-NEXT: buffer_store_dword v117, off, s[0:3], s32 offset:456 ; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:452 -; ALIGNED-NEXT: flat_store_byte v[2:3], v68 offset:213 -; ALIGNED-NEXT: flat_store_byte v[2:3], v65 offset:215 -; ALIGNED-NEXT: flat_store_byte v[2:3], v38 offset:209 -; ALIGNED-NEXT: flat_store_byte v[2:3], v66 offset:211 -; ALIGNED-NEXT: flat_store_byte v[2:3], v39 offset:210 -; ALIGNED-NEXT: flat_store_byte v[2:3], v70 offset:214 -; ALIGNED-NEXT: flat_store_byte v[2:3], v80 offset:212 -; ALIGNED-NEXT: flat_store_byte v[2:3], v53 offset:218 -; ALIGNED-NEXT: flat_store_byte v[2:3], v52 offset:219 -; ALIGNED-NEXT: flat_store_byte v[2:3], v67 offset:217 -; ALIGNED-NEXT: flat_store_byte v[2:3], v51 offset:223 -; ALIGNED-NEXT: flat_store_byte v[2:3], v55 offset:221 -; ALIGNED-NEXT: flat_store_byte v[2:3], v54 offset:222 -; ALIGNED-NEXT: flat_store_byte v[2:3], v69 offset:220 -; ALIGNED-NEXT: flat_store_byte v[2:3], v71 offset:216 -; ALIGNED-NEXT: flat_store_byte v[2:3], v50 offset:208 -; ALIGNED-NEXT: buffer_store_dword v73, off, s[0:3], s32 offset:472 -; ALIGNED-NEXT: buffer_store_dword v88, off, s[0:3], s32 offset:476 +; ALIGNED-NEXT: flat_store_byte v[4:5], v67 offset:210 +; ALIGNED-NEXT: flat_store_byte v[4:5], v64 offset:212 +; ALIGNED-NEXT: flat_store_byte v[4:5], v38 offset:206 +; 
ALIGNED-NEXT: flat_store_byte v[4:5], v65 offset:208 +; ALIGNED-NEXT: flat_store_byte v[4:5], v39 offset:207 +; ALIGNED-NEXT: flat_store_byte v[4:5], v70 offset:211 +; ALIGNED-NEXT: flat_store_byte v[4:5], v80 offset:209 +; ALIGNED-NEXT: flat_store_byte v[4:5], v53 offset:215 +; ALIGNED-NEXT: flat_store_byte v[4:5], v52 offset:216 +; ALIGNED-NEXT: flat_store_byte v[4:5], v66 offset:214 +; ALIGNED-NEXT: flat_store_byte v[4:5], v51 offset:220 +; ALIGNED-NEXT: flat_store_byte v[4:5], v55 offset:218 +; ALIGNED-NEXT: flat_store_byte v[4:5], v54 offset:219 +; ALIGNED-NEXT: flat_store_byte v[4:5], v69 offset:217 +; ALIGNED-NEXT: flat_store_byte v[4:5], v71 offset:213 +; ALIGNED-NEXT: flat_store_byte v[4:5], v49 offset:205 +; ALIGNED-NEXT: buffer_store_dword v72, off, s[0:3], s32 offset:472 +; ALIGNED-NEXT: buffer_store_dword v79, off, s[0:3], s32 offset:476 ; ALIGNED-NEXT: buffer_store_dword v90, off, s[0:3], s32 offset:468 -; ALIGNED-NEXT: buffer_store_dword v106, off, s[0:3], s32 offset:464 -; ALIGNED-NEXT: flat_store_byte v[2:3], v82 offset:202 -; ALIGNED-NEXT: flat_store_byte v[2:3], v85 offset:203 -; ALIGNED-NEXT: flat_store_byte v[2:3], v83 offset:201 -; ALIGNED-NEXT: flat_store_byte v[2:3], v86 offset:207 -; ALIGNED-NEXT: flat_store_byte v[2:3], v87 offset:205 -; ALIGNED-NEXT: flat_store_byte v[2:3], v96 offset:206 -; ALIGNED-NEXT: flat_store_byte v[2:3], v97 offset:204 -; ALIGNED-NEXT: flat_store_byte v[2:3], v84 offset:200 -; ALIGNED-NEXT: flat_store_byte v[2:3], v101 offset:194 -; ALIGNED-NEXT: flat_store_byte v[2:3], v100 offset:195 -; ALIGNED-NEXT: flat_store_byte v[2:3], v112 offset:193 -; ALIGNED-NEXT: flat_store_byte v[2:3], v99 offset:199 -; ALIGNED-NEXT: flat_store_byte v[2:3], v103 offset:197 -; ALIGNED-NEXT: flat_store_byte v[2:3], v102 offset:198 -; ALIGNED-NEXT: flat_store_byte v[2:3], v113 offset:196 -; ALIGNED-NEXT: flat_store_byte v[2:3], v115 offset:192 -; ALIGNED-NEXT: v_lshl_or_b32 v125, v0, 16, v125 +; ALIGNED-NEXT: buffer_store_dword v107, 
off, s[0:3], s32 offset:464 +; ALIGNED-NEXT: flat_store_byte v[4:5], v82 offset:199 +; ALIGNED-NEXT: flat_store_byte v[4:5], v85 offset:200 +; ALIGNED-NEXT: flat_store_byte v[4:5], v83 offset:198 +; ALIGNED-NEXT: flat_store_byte v[4:5], v86 offset:204 +; ALIGNED-NEXT: flat_store_byte v[4:5], v87 offset:202 +; ALIGNED-NEXT: flat_store_byte v[4:5], v96 offset:203 +; ALIGNED-NEXT: flat_store_byte v[4:5], v97 offset:201 +; ALIGNED-NEXT: flat_store_byte v[4:5], v84 offset:197 +; ALIGNED-NEXT: flat_store_byte v[4:5], v101 offset:191 +; ALIGNED-NEXT: flat_store_byte v[4:5], v100 offset:192 +; ALIGNED-NEXT: flat_store_byte v[4:5], v112 offset:190 +; ALIGNED-NEXT: flat_store_byte v[4:5], v99 offset:196 +; ALIGNED-NEXT: flat_store_byte v[4:5], v103 offset:194 +; ALIGNED-NEXT: flat_store_byte v[4:5], v102 offset:195 +; ALIGNED-NEXT: flat_store_byte v[4:5], v113 offset:193 +; ALIGNED-NEXT: flat_store_byte v[4:5], v115 offset:189 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00 ; ALIGNED-NEXT: s_addc_u32 s5, s5, -1 @@ -15169,22 +15352,22 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 -; ALIGNED-NEXT: flat_store_byte v[2:3], v117 offset:186 -; ALIGNED-NEXT: flat_store_byte v[2:3], v40 offset:187 -; ALIGNED-NEXT: flat_store_byte v[2:3], v118 offset:185 -; ALIGNED-NEXT: flat_store_byte v[2:3], v41 offset:191 -; ALIGNED-NEXT: flat_store_byte v[2:3], v43 offset:189 -; ALIGNED-NEXT: flat_store_byte v[2:3], v42 offset:190 -; ALIGNED-NEXT: flat_store_byte v[2:3], v44 offset:188 -; ALIGNED-NEXT: flat_store_byte v[2:3], v119 offset:184 -; ALIGNED-NEXT: flat_store_byte v[2:3], v47 offset:178 -; ALIGNED-NEXT: flat_store_byte v[2:3], v56 offset:179 -; ALIGNED-NEXT: flat_store_byte 
v[2:3], v59 offset:177 -; ALIGNED-NEXT: flat_store_byte v[2:3], v46 offset:183 -; ALIGNED-NEXT: flat_store_byte v[2:3], v57 offset:181 -; ALIGNED-NEXT: flat_store_byte v[2:3], v58 offset:182 -; ALIGNED-NEXT: flat_store_byte v[2:3], v60 offset:180 -; ALIGNED-NEXT: flat_store_byte v[2:3], v61 offset:176 +; ALIGNED-NEXT: flat_store_byte v[4:5], v116 offset:183 +; ALIGNED-NEXT: flat_store_byte v[4:5], v40 offset:184 +; ALIGNED-NEXT: flat_store_byte v[4:5], v118 offset:182 +; ALIGNED-NEXT: flat_store_byte v[4:5], v41 offset:188 +; ALIGNED-NEXT: flat_store_byte v[4:5], v43 offset:186 +; ALIGNED-NEXT: flat_store_byte v[4:5], v42 offset:187 +; ALIGNED-NEXT: flat_store_byte v[4:5], v44 offset:185 +; ALIGNED-NEXT: flat_store_byte v[4:5], v119 offset:181 +; ALIGNED-NEXT: flat_store_byte v[4:5], v47 offset:175 +; ALIGNED-NEXT: flat_store_byte v[4:5], v56 offset:176 +; ALIGNED-NEXT: flat_store_byte v[4:5], v59 offset:174 +; ALIGNED-NEXT: flat_store_byte v[4:5], v46 offset:180 +; ALIGNED-NEXT: flat_store_byte v[4:5], v57 offset:178 +; ALIGNED-NEXT: flat_store_byte v[4:5], v58 offset:179 +; ALIGNED-NEXT: flat_store_byte v[4:5], v60 offset:177 +; ALIGNED-NEXT: flat_store_byte v[4:5], v61 offset:173 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 @@ -15197,23 +15380,23 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 -; ALIGNED-NEXT: flat_store_byte v[2:3], v63 offset:170 -; ALIGNED-NEXT: flat_store_byte v[2:3], v75 offset:171 -; ALIGNED-NEXT: flat_store_byte v[2:3], v72 offset:169 -; ALIGNED-NEXT: flat_store_byte v[2:3], v77 offset:175 -; ALIGNED-NEXT: flat_store_byte v[2:3], v79 offset:173 -; ALIGNED-NEXT: 
flat_store_byte v[2:3], v78 offset:174 -; ALIGNED-NEXT: flat_store_byte v[2:3], v89 offset:172 -; ALIGNED-NEXT: flat_store_byte v[2:3], v74 offset:168 -; ALIGNED-NEXT: flat_store_byte v[2:3], v93 offset:162 -; ALIGNED-NEXT: flat_store_byte v[2:3], v92 offset:163 -; ALIGNED-NEXT: flat_store_byte v[2:3], v105 offset:161 -; ALIGNED-NEXT: flat_store_byte v[2:3], v91 offset:167 -; ALIGNED-NEXT: flat_store_byte v[2:3], v95 offset:165 -; ALIGNED-NEXT: flat_store_byte v[2:3], v94 offset:166 -; ALIGNED-NEXT: flat_store_byte v[2:3], v107 offset:164 -; ALIGNED-NEXT: flat_store_byte v[2:3], v108 offset:160 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[4:5], v62 offset:167 +; ALIGNED-NEXT: flat_store_byte v[4:5], v74 offset:168 +; ALIGNED-NEXT: flat_store_byte v[4:5], v63 offset:166 +; ALIGNED-NEXT: flat_store_byte v[4:5], v75 offset:172 +; ALIGNED-NEXT: flat_store_byte v[4:5], v77 offset:170 +; ALIGNED-NEXT: flat_store_byte v[4:5], v76 offset:171 +; ALIGNED-NEXT: flat_store_byte v[4:5], v78 offset:169 +; ALIGNED-NEXT: flat_store_byte v[4:5], v73 offset:165 +; ALIGNED-NEXT: flat_store_byte v[4:5], v94 offset:159 +; ALIGNED-NEXT: flat_store_byte v[4:5], v92 offset:160 +; ALIGNED-NEXT: flat_store_byte v[4:5], v105 offset:158 +; ALIGNED-NEXT: flat_store_byte v[4:5], v91 offset:164 +; ALIGNED-NEXT: flat_store_byte v[4:5], v95 offset:162 +; ALIGNED-NEXT: flat_store_byte v[4:5], v104 offset:163 +; ALIGNED-NEXT: flat_store_byte v[4:5], v106 offset:161 +; ALIGNED-NEXT: flat_store_byte v[4:5], v108 offset:157 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Reload @@ -15225,44 +15408,46 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 -; ALIGNED-NEXT: flat_store_byte v[2:3], v111 offset:154 -; ALIGNED-NEXT: flat_store_byte v[2:3], v124 offset:155 -; ALIGNED-NEXT: flat_store_byte v[2:3], v121 offset:153 -; ALIGNED-NEXT: flat_store_byte v[2:3], v126 offset:159 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1416 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[4:5], v111 offset:151 +; ALIGNED-NEXT: flat_store_byte v[4:5], v124 offset:152 +; ALIGNED-NEXT: flat_store_byte v[4:5], v122 offset:150 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:157 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1420 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:156 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:154 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1416 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:158 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:155 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:156 -; ALIGNED-NEXT: flat_store_byte v[2:3], v123 offset:152 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1360 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:153 +; ALIGNED-NEXT: flat_store_byte v[4:5], v123 offset:149 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1356 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: 
flat_store_byte v[2:3], v0 offset:146 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1348 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:143 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1344 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:147 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:144 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1340 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:145 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1388 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:142 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1380 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:151 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1372 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:148 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1360 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:149 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1376 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:146 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1368 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:150 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1356 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:147 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1352 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:148 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:145 ; ALIGNED-NEXT: buffer_load_dword 
v0, off, s[0:3], s32 offset:1332 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:144 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:141 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1328 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 @@ -15277,49 +15462,49 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1324 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:138 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:135 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1320 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:139 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:136 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1316 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:137 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:134 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1304 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:143 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:140 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1296 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:141 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:138 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1300 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:142 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:139 ; ALIGNED-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:1288 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:140 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:137 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1308 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:136 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:133 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1268 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:130 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:127 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1260 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:131 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:128 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1256 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:129 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:126 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1280 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:135 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:132 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1272 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:133 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:130 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1276 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:134 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:131 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1264 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; 
ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:132 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:129 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1252 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:128 @@ -15337,52 +15522,52 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1244 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:122 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:119 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1240 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:123 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:120 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1236 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:121 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:118 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1224 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:127 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:124 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1216 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:125 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:122 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:126 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:123 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; 
ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:124 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:121 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1228 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:120 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:117 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1188 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:114 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:111 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:115 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:112 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1176 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:113 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:110 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:119 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:116 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:117 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:114 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1196 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:118 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:115 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1184 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:116 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:113 ; 
ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:112 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:109 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:632 @@ -15397,52 +15582,52 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1164 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:106 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:103 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1160 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:107 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:104 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1156 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:105 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:102 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1144 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:111 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:108 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1136 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:109 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:106 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1140 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:110 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 
offset:107 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:108 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:105 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1148 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:104 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:101 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1108 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:98 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:95 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1100 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:99 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:96 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1096 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:97 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:94 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1120 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:103 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:100 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1112 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:101 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:98 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1116 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:102 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:99 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1104 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt 
vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:100 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:97 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1092 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:96 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:93 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1088 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 @@ -15457,52 +15642,52 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1084 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:90 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:87 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1080 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:91 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:88 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1076 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:89 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:86 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1064 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:95 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:92 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1056 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:93 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:90 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1060 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt 
vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:94 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:91 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1048 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:92 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:89 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:88 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:85 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1028 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:82 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:79 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1020 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:83 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:80 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1016 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:81 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:78 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1040 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:87 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:84 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1032 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:85 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:82 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1036 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:86 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:83 ; ALIGNED-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:1024 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:84 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:81 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1012 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:80 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:77 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1008 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 @@ -15517,49 +15702,49 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1004 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:74 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:71 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1000 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:75 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:72 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:996 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:73 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:70 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:980 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:79 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:76 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:77 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:74 ; ALIGNED-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:78 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:75 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:76 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:73 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:984 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:72 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:69 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:66 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:63 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:67 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:64 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:65 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:62 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:71 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:68 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:69 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:66 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte 
v[2:3], v0 offset:70 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:67 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:68 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:65 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:64 @@ -15577,52 +15762,52 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:61 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:58 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:58 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:55 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:59 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:56 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:57 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:54 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:63 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:60 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:59 ; 
ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:62 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:57 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:60 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:56 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:53 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:53 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:50 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:50 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:47 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:51 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:48 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:49 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:46 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:55 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:52 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:54 +; ALIGNED-NEXT: 
flat_store_byte v[4:5], v0 offset:51 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:52 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:49 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:48 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:45 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 @@ -15637,149 +15822,147 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:43 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:40 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:42 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:39 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:41 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:38 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:40 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], 
v0 offset:47 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:37 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:46 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:44 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:43 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:45 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:42 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:44 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:41 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:35 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:32 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:34 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:31 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:33 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:30 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:32 ; 
ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:39 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:36 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:38 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:35 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:37 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:34 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:36 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:33 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:644 -; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:640 +; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:640 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:26 
+; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:23 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:27 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:24 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:25 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:22 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:31 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:28 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:29 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:26 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:30 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:27 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:28 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:25 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:24 -; ALIGNED-NEXT: flat_store_byte v[2:3], v62 offset:18 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:21 +; ALIGNED-NEXT: flat_store_byte v[4:5], v88 offset:15 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1452 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:19 -; ALIGNED-NEXT: flat_store_byte 
v[2:3], v76 offset:17 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:16 +; ALIGNED-NEXT: flat_store_byte v[4:5], v89 offset:14 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:23 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:20 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:21 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:18 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:22 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:19 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:20 -; ALIGNED-NEXT: flat_store_byte v[2:3], v104 offset:16 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:17 +; ALIGNED-NEXT: flat_store_byte v[2:3], v93 offset:16 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1420 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1400 ; 
4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 -; ALIGNED-NEXT: flat_store_byte v[2:3], v109 offset:10 -; ALIGNED-NEXT: flat_store_byte v[2:3], v120 offset:11 -; ALIGNED-NEXT: flat_store_byte v[2:3], v127 offset:13 -; ALIGNED-NEXT: flat_store_byte v[2:3], v110 offset:9 +; ALIGNED-NEXT: flat_store_byte v[4:5], v109 offset:7 +; ALIGNED-NEXT: flat_store_byte v[4:5], v120 offset:8 +; ALIGNED-NEXT: flat_store_byte v[4:5], v125 offset:10 +; ALIGNED-NEXT: flat_store_byte v[4:5], v110 offset:6 +; ALIGNED-NEXT: flat_store_byte v[4:5], v126 offset:12 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1432 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:15 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:11 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1428 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:14 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:12 -; ALIGNED-NEXT: flat_store_byte v[2:3], v122 offset:8 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1368 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:9 +; ALIGNED-NEXT: flat_store_byte v[2:3], v121 offset:8 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1376 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:2 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1352 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1364 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:3 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1344 ; 4-byte 
Folded Reload +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1348 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:1 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:7 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1380 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:5 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:4 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1384 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:6 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1364 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:2 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1388 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:3 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1372 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:4 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1336 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/neg_ashr64_reduce.ll b/llvm/test/CodeGen/AMDGPU/neg_ashr64_reduce.ll index 9713689217cf7..db82530f66aa4 100644 --- a/llvm/test/CodeGen/AMDGPU/neg_ashr64_reduce.ll +++ b/llvm/test/CodeGen/AMDGPU/neg_ashr64_reduce.ll @@ -49,16 +49,18 @@ define <3 x i64> @v3_ashr_metadata(ptr %arg0.ptr, ptr %arg1.ptr) { ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; CHECK-NEXT: v_add_co_u32_e32 v0, vcc, 20, v0 +; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; CHECK-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_load_dword v4, v[0:1] offset:20 -; CHECK-NEXT: flat_load_dword v6, v[2:3] offset:16 +; CHECK-NEXT: flat_load_dword v4, v[2:3] offset:16 ; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] -; CHECK-NEXT: v_mov_b32_e32 v1, -1 ; CHECK-NEXT: v_mov_b32_e32 v3, -1 +; CHECK-NEXT: flat_load_dword v1, v[0:1] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_ashrrev_i32_e32 v4, v6, v4 ; CHECK-NEXT: v_ashrrev_i32_e32 v0, v8, v5 ; CHECK-NEXT: v_ashrrev_i32_e32 v2, v10, v7 +; CHECK-NEXT: v_ashrrev_i32_e32 v4, v4, v1 +; CHECK-NEXT: v_mov_b32_e32 v1, -1 ; CHECK-NEXT: v_mov_b32_e32 v5, -1 ; CHECK-NEXT: s_setpc_b64 s[30:31] %val = load <3 x i64>, ptr %arg0.ptr, !range !4, !noundef !{} diff --git a/llvm/test/CodeGen/AMDGPU/no-folding-imm-to-inst-with-fi.ll b/llvm/test/CodeGen/AMDGPU/no-folding-imm-to-inst-with-fi.ll index 7e4be65898b65..587e454da884c 100644 --- a/llvm/test/CodeGen/AMDGPU/no-folding-imm-to-inst-with-fi.ll +++ b/llvm/test/CodeGen/AMDGPU/no-folding-imm-to-inst-with-fi.ll @@ -11,63 +11,73 @@ define protected amdgpu_kernel void @no_folding_imm_to_inst_with_fi(<4 x i64> %v ; CHECK-NEXT: s_mov_b64 s[34:35], src_private_base ; CHECK-NEXT: s_movk_i32 s34, 0x80 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: v_dual_mov_b32 v20, s34 :: v_dual_mov_b32 v21, s35 +; CHECK-NEXT: s_add_nc_u64 s[44:45], s[34:35], 0x70 +; CHECK-NEXT: v_dual_mov_b32 v26, s34 :: v_dual_mov_b32 v27, s35 +; CHECK-NEXT: v_dual_mov_b32 v20, s44 :: v_dual_mov_b32 v21, s45 ; CHECK-NEXT: s_wait_kmcnt 0x0 ; CHECK-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v1, s41 ; CHECK-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43 ; CHECK-NEXT: v_dual_mov_b32 v4, s36 :: v_dual_mov_b32 v5, s37 ; CHECK-NEXT: v_dual_mov_b32 v6, s38 :: v_dual_mov_b32 v7, s39 +; CHECK-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; CHECK-NEXT: s_add_nc_u64 s[24:25], s[34:35], 0x60 ; CHECK-NEXT: v_dual_mov_b32 v8, s28 :: v_dual_mov_b32 v9, s29 ; 
CHECK-NEXT: v_dual_mov_b32 v10, s30 :: v_dual_mov_b32 v11, s31 -; CHECK-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 -; CHECK-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 ; CHECK-NEXT: v_dual_mov_b32 v16, s20 :: v_dual_mov_b32 v17, s21 +; CHECK-NEXT: s_add_nc_u64 s[20:21], s[34:35], 0x50 +; CHECK-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; CHECK-NEXT: s_wait_alu 0xfffe +; CHECK-NEXT: v_dual_mov_b32 v22, s24 :: v_dual_mov_b32 v23, s25 ; CHECK-NEXT: v_dual_mov_b32 v18, s22 :: v_dual_mov_b32 v19, s23 +; CHECK-NEXT: v_dual_mov_b32 v25, s21 :: v_dual_mov_b32 v24, s20 ; CHECK-NEXT: scratch_store_b128 off, v[0:3], off offset:16 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 ; CHECK-NEXT: scratch_store_b128 off, v[4:7], off scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 -; CHECK-NEXT: flat_store_b128 v[20:21], v[8:11] offset:112 scope:SCOPE_SYS +; CHECK-NEXT: flat_store_b128 v[20:21], v[8:11] scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 -; CHECK-NEXT: flat_store_b128 v[20:21], v[12:15] offset:96 scope:SCOPE_SYS +; CHECK-NEXT: flat_store_b128 v[22:23], v[12:15] scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 -; CHECK-NEXT: flat_store_b128 v[20:21], v[16:19] offset:80 scope:SCOPE_SYS +; CHECK-NEXT: flat_store_b128 v[24:25], v[16:19] scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 +; CHECK-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13 +; CHECK-NEXT: s_add_nc_u64 s[12:13], s[34:35], 48 ; CHECK-NEXT: v_dual_mov_b32 v0, s16 :: v_dual_mov_b32 v1, s17 ; CHECK-NEXT: v_dual_mov_b32 v2, s18 :: v_dual_mov_b32 v3, s19 -; CHECK-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13 ; CHECK-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s15 +; CHECK-NEXT: s_wait_alu 0xfffe +; CHECK-NEXT: v_dual_mov_b32 v29, s13 :: v_dual_mov_b32 v28, s12 ; CHECK-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 ; CHECK-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 ; CHECK-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v13, 
s5 ; CHECK-NEXT: v_dual_mov_b32 v14, s6 :: v_dual_mov_b32 v15, s7 ; CHECK-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v17, s1 ; CHECK-NEXT: v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v19, s3 -; CHECK-NEXT: flat_store_b128 v[20:21], v[0:3] offset:64 scope:SCOPE_SYS +; CHECK-NEXT: flat_store_b128 v[26:27], v[0:3] offset:64 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 -; CHECK-NEXT: flat_store_b128 v[20:21], v[4:7] offset:48 scope:SCOPE_SYS +; CHECK-NEXT: flat_store_b128 v[28:29], v[4:7] scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 -; CHECK-NEXT: flat_store_b128 v[20:21], v[8:11] offset:32 scope:SCOPE_SYS +; CHECK-NEXT: flat_store_b128 v[26:27], v[8:11] offset:32 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 -; CHECK-NEXT: flat_store_b128 v[20:21], v[12:15] offset:16 scope:SCOPE_SYS +; CHECK-NEXT: flat_store_b128 v[26:27], v[12:15] offset:16 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 -; CHECK-NEXT: flat_store_b128 v[20:21], v[16:19] scope:SCOPE_SYS +; CHECK-NEXT: flat_store_b128 v[26:27], v[16:19] scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 -; CHECK-NEXT: flat_load_b128 v[0:3], v[20:21] offset:96 scope:SCOPE_SYS +; CHECK-NEXT: flat_load_b128 v[0:3], v[22:23] scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0 -; CHECK-NEXT: flat_load_b128 v[0:3], v[20:21] offset:112 scope:SCOPE_SYS +; CHECK-NEXT: flat_load_b128 v[0:3], v[20:21] scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0 -; CHECK-NEXT: flat_load_b128 v[0:3], v[20:21] offset:64 scope:SCOPE_SYS +; CHECK-NEXT: flat_load_b128 v[0:3], v[26:27] offset:64 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0 -; CHECK-NEXT: flat_load_b128 v[0:3], v[20:21] offset:80 scope:SCOPE_SYS +; CHECK-NEXT: flat_load_b128 v[0:3], v[24:25] scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0 -; CHECK-NEXT: flat_load_b128 v[0:3], v[20:21] offset:32 scope:SCOPE_SYS +; CHECK-NEXT: flat_load_b128 v[0:3], v[26:27] offset:32 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0 -; CHECK-NEXT: 
flat_load_b128 v[0:3], v[20:21] offset:48 scope:SCOPE_SYS +; CHECK-NEXT: flat_load_b128 v[0:3], v[28:29] scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0 -; CHECK-NEXT: flat_load_b128 v[0:3], v[20:21] scope:SCOPE_SYS +; CHECK-NEXT: flat_load_b128 v[0:3], v[26:27] scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0 -; CHECK-NEXT: flat_load_b128 v[0:3], v[20:21] offset:16 scope:SCOPE_SYS +; CHECK-NEXT: flat_load_b128 v[0:3], v[26:27] offset:16 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt 0x0 ; CHECK-NEXT: s_endpgm bb: diff --git a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll index f4a9e7e8f2759..75638c5fa8476 100644 --- a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll +++ b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll @@ -304,62 +304,72 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX906-NEXT: ; use s29 ; GFX906-NEXT: ;;#ASMEND ; GFX906-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX906-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload -; GFX906-NEXT: 
buffer_load_dword v11, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:64 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:68 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:72 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:76 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:80 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:84 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:88 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:92 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:96 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:100 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:104 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:108 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:112 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:116 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:120 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:124 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:136 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:140 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v33, off, 
s[0:3], s33 offset:144 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:64 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:68 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:72 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:76 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:80 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:84 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:88 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:92 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:96 ; 4-byte Folded Reload +; 
GFX906-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:100 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:104 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:108 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:112 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:116 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:120 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:124 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:136 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v34, off, s[0:3], s33 offset:140 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v35, off, s[0:3], s33 offset:144 ; 4-byte Folded Reload ; GFX906-NEXT: v_readlane_b32 s31, v41, 1 ; GFX906-NEXT: v_readlane_b32 s30, v41, 0 ; GFX906-NEXT: s_mov_b32 s32, s33 ; GFX906-NEXT: v_readlane_b32 s4, v41, 4 ; GFX906-NEXT: v_readlane_b32 s34, v41, 2 ; GFX906-NEXT: v_readlane_b32 s35, v41, 3 +; GFX906-NEXT: s_waitcnt vmcnt(33) +; GFX906-NEXT: v_add_co_u32_e32 v0, vcc, 0x70, v2 +; GFX906-NEXT: s_waitcnt vmcnt(32) +; GFX906-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[30:33] offset:112 +; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[32:35] ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[26:29] offset:96 +; GFX906-NEXT: v_add_co_u32_e32 v0, vcc, 0x60, v2 +; GFX906-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[28:31] ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[22:25] offset:80 +; 
GFX906-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v2 +; GFX906-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[24:27] ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[18:21] offset:64 +; GFX906-NEXT: flat_store_dwordx4 v[2:3], v[20:23] offset:64 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[14:17] offset:48 +; GFX906-NEXT: v_add_co_u32_e32 v0, vcc, 48, v2 +; GFX906-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[16:19] ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[10:13] offset:32 +; GFX906-NEXT: flat_store_dwordx4 v[2:3], v[12:15] offset:32 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[6:9] offset:16 +; GFX906-NEXT: flat_store_dwordx4 v[2:3], v[8:11] offset:16 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; GFX906-NEXT: flat_store_dwordx4 v[2:3], v[4:7] ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX906-NEXT: s_xor_saveexec_b64 s[6:7], -1 @@ -683,57 +693,67 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX908-NEXT: ; use s29 ; GFX908-NEXT: ;;#ASMEND ; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload -; GFX908-NEXT: 
buffer_load_dword v7, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:64 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:68 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:72 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:76 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:80 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:84 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:88 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:92 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:96 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:100 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:104 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:108 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:112 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:116 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:120 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:124 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v29, off, s[0:3], 
s33 offset:136 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:140 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:144 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:148 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:152 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:64 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:68 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:72 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:76 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:80 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:84 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:88 ; 4-byte Folded Reload +; 
GFX908-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:92 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:96 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:100 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:104 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:108 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:112 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:116 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:120 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:124 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:136 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:140 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:144 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v34, off, s[0:3], s33 offset:148 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v35, off, s[0:3], s33 offset:152 ; 4-byte Folded Reload ; GFX908-NEXT: s_mov_b64 s[4:5], exec +; GFX908-NEXT: s_waitcnt vmcnt(33) +; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x70, v2 +; GFX908-NEXT: s_waitcnt vmcnt(32) +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[30:33] offset:112 +; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[32:35] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[26:29] offset:96 +; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x60, v2 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc 
+; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[28:31] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[22:25] offset:80 +; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v2 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[24:27] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[18:21] offset:64 +; GFX908-NEXT: flat_store_dwordx4 v[2:3], v[20:23] offset:64 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[14:17] offset:48 +; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 48, v2 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[16:19] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[10:13] offset:32 +; GFX908-NEXT: flat_store_dwordx4 v[2:3], v[12:15] offset:32 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[6:9] offset:16 +; GFX908-NEXT: flat_store_dwordx4 v[2:3], v[8:11] offset:16 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; GFX908-NEXT: flat_store_dwordx4 v[2:3], v[4:7] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_mov_b64 exec, 1 ; GFX908-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:168