diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 9d189515f2bbb..57e99d9719c83 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -3776,13 +3776,6 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, if (pseudoToMCOpcode(NewOpc) == -1) return false; - // V_FMAMK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAMK_F16_fake16 - // takes VGPR_32_Lo128 operands, so the rewrite would also require - // restricting their register classes. For now just bail out. - if (NewOpc == AMDGPU::V_FMAMK_F16_t16 || - NewOpc == AMDGPU::V_FMAMK_F16_fake16) - return false; - const std::optional SubRegImm = extractSubregFromImm( Imm, RegSrc == Src1 ? Src0->getSubReg() : Src1->getSubReg()); @@ -3807,6 +3800,18 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, removeModOperands(UseMI); UseMI.setDesc(get(NewOpc)); + if (NewOpc == AMDGPU::V_FMAMK_F16_t16 || + NewOpc == AMDGPU::V_FMAMK_F16_fake16) { + auto Tmp = MRI->createVirtualRegister( + NewOpc == AMDGPU::V_FMAMK_F16_t16 ? &AMDGPU::VGPR_16_Lo128RegClass + : &AMDGPU::VGPR_32_Lo128RegClass); + BuildMI(*UseMI.getParent(), std::next(UseMI.getIterator()), + UseMI.getDebugLoc(), get(AMDGPU::COPY), + UseMI.getOperand(0).getReg()) + .addReg(Tmp, RegState::Kill); + UseMI.getOperand(0).setReg(Tmp); + } + bool DeleteDef = MRI->use_nodbg_empty(Reg); if (DeleteDef) DefMI.eraseFromParent(); @@ -3854,13 +3859,6 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, if (pseudoToMCOpcode(NewOpc) == -1) return false; - // V_FMAAK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAAK_F16_fake16 - // takes VGPR_32_Lo128 operands, so the rewrite would also require - // restricting their register classes. For now just bail out. - if (NewOpc == AMDGPU::V_FMAAK_F16_t16 || - NewOpc == AMDGPU::V_FMAAK_F16_fake16) - return false; - // FIXME: This would be a lot easier if we could return a new instruction // instead of having to modify in place. @@ -3880,6 +3878,19 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, // These come before src2. removeModOperands(UseMI); UseMI.setDesc(get(NewOpc)); + + if (NewOpc == AMDGPU::V_FMAAK_F16_t16 || + NewOpc == AMDGPU::V_FMAAK_F16_fake16) { + auto Tmp = MRI->createVirtualRegister( + NewOpc == AMDGPU::V_FMAAK_F16_t16 ? &AMDGPU::VGPR_16_Lo128RegClass + : &AMDGPU::VGPR_32_Lo128RegClass); + BuildMI(*UseMI.getParent(), std::next(UseMI.getIterator()), + UseMI.getDebugLoc(), get(AMDGPU::COPY), + UseMI.getOperand(0).getReg()) + .addReg(Tmp, RegState::Kill); + UseMI.getOperand(0).setReg(Tmp); + } + // It might happen that UseMI was commuted // and we now have SGPR as SRC1. If so 2 inlined // constant and SGPR are illegal. diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index 272d4b5609dfb..8d9fd3662b760 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -614,9 +614,9 @@ def VGPR_16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16, def VGPR_16_Lo128 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16, (add (interleave (sequence "VGPR%u_LO16", 0, 127), (sequence "VGPR%u_HI16", 0, 127)))> { + let AllocationPriority = !add(2, !mul(BaseClassPriority, BaseClassScaleFactor)); let Size = 16; let GeneratePressureSet = 0; - let isAllocatable = 0; // This is the base class for VGPR{0..127}_{LO16,HI16}. let BaseClassOrder = 16; diff --git a/llvm/test/CodeGen/AMDGPU/fma.f16.ll b/llvm/test/CodeGen/AMDGPU/fma.f16.ll index bbd493f668847..20db029aad27f 100644 --- a/llvm/test/CodeGen/AMDGPU/fma.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fma.f16.ll @@ -428,12 +428,11 @@ define i32 @test_D139469_f16(half %arg) { ; GFX11-SDAG-FAKE16-LABEL: test_D139469_f16: ; GFX11-SDAG-FAKE16: ; %bb.0: ; %bb ; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0x211e -; GFX11-SDAG-FAKE16-NEXT: v_mul_f16_e32 v2, 0x291e, v0 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-FAKE16-NEXT: v_fmac_f16_e32 v1, 0x291e, v0 -; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v0, v2, v1 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: s_movk_i32 s0, 0x291e +; GFX11-SDAG-FAKE16-NEXT: v_mul_f16_e32 v1, 0x291e, v0 +; GFX11-SDAG-FAKE16-NEXT: v_fmaak_f16 v0, s0, v0, 0x211e +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v0, v1, v0 ; GFX11-SDAG-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0 ; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] @@ -491,12 +490,12 @@ define i32 @test_D139469_f16(half %arg) { ; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0x211e -; GFX12-SDAG-FAKE16-NEXT: v_mul_f16_e32 v2, 0x291e, v0 -; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-SDAG-FAKE16-NEXT: v_fmac_f16_e32 v1, 0x291e, v0 -; GFX12-SDAG-FAKE16-NEXT: v_min_num_f16_e32 v0, v2, v1 -; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-FAKE16-NEXT: s_movk_i32 s0, 0x291e +; GFX12-SDAG-FAKE16-NEXT: v_mul_f16_e32 v1, 0x291e, v0 +; GFX12-SDAG-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-SDAG-FAKE16-NEXT: v_fmaak_f16 v0, s0, v0, 0x211e +; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-FAKE16-NEXT: v_min_num_f16_e32 v0, v1, v0 ; GFX12-SDAG-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0 ; GFX12-SDAG-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/fmamk_fmaak-t16.mir b/llvm/test/CodeGen/AMDGPU/fmamk_fmaak-t16.mir new file mode 100644 index 0000000000000..3eec7f95a976f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fmamk_fmaak-t16.mir @@ -0,0 +1,36 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -start-before=greedy,0 -stop-after=virtregrewriter,2 -o - %s | FileCheck %s -check-prefix=GFX11 + +--- +name: v_fmamk_f16 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; GFX11-LABEL: name: v_fmamk_f16 + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: renamable $vgpr0_lo16 = nofpexcept V_FMAMK_F16_t16 killed $vgpr0_lo16, 1, killed $vgpr1_hi16, implicit $exec, implicit $mode + ; GFX11-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0_lo16 + %0:vgpr_32_lo128 = COPY $vgpr0 + %1:vgpr_32_lo128 = COPY $vgpr1 + %2:vgpr_16_lo128 = nofpexcept V_FMAMK_F16_t16 %0.lo16, 1, %1.hi16, implicit $exec, implicit $mode + S_ENDPGM 0, implicit %2 +... + +--- +name: v_fmaak_f16 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; GFX11-LABEL: name: v_fmaak_f16 + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: renamable $vgpr0_lo16 = nofpexcept V_FMAAK_F16_t16 killed $vgpr0_lo16, killed $vgpr1_hi16, 1, implicit $exec, implicit $mode + ; GFX11-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0_lo16 + %0:vgpr_32_lo128 = COPY $vgpr0 + %1:vgpr_32_lo128 = COPY $vgpr1 + %2:vgpr_16_lo128 = nofpexcept V_FMAAK_F16_t16 %0.lo16, %1.hi16, 1, implicit $exec, implicit $mode + S_ENDPGM 0, implicit %2 +... diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll index efb55db486489..da0524c2bd93b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll @@ -509,8 +509,8 @@ define amdgpu_kernel void @fmuladd_f16_imm_a( ; GFX11-DENORM-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-DENORM-FAKE16-NEXT: s_mov_b32 s8, s0 ; GFX11-DENORM-FAKE16-NEXT: s_mov_b32 s9, s1 -; GFX11-DENORM-FAKE16-NEXT: v_fmac_f16_e32 v1, 0x4200, v0 -; GFX11-DENORM-FAKE16-NEXT: buffer_store_b16 v1, off, s[8:11], 0 +; GFX11-DENORM-FAKE16-NEXT: v_fmamk_f16 v0, v0, 0x4200, v1 +; GFX11-DENORM-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0 ; GFX11-DENORM-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %b, @@ -741,8 +741,8 @@ define amdgpu_kernel void @fmuladd_f16_imm_b( ; GFX11-DENORM-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-DENORM-FAKE16-NEXT: s_mov_b32 s8, s0 ; GFX11-DENORM-FAKE16-NEXT: s_mov_b32 s9, s1 -; GFX11-DENORM-FAKE16-NEXT: v_fmac_f16_e32 v1, 0x4200, v0 -; GFX11-DENORM-FAKE16-NEXT: buffer_store_b16 v1, off, s[8:11], 0 +; GFX11-DENORM-FAKE16-NEXT: v_fmamk_f16 v0, v0, 0x4200, v1 +; GFX11-DENORM-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0 ; GFX11-DENORM-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a,