From 80498f57b36898f1ea1a3f0fef85740017558fd5 Mon Sep 17 00:00:00 2001 From: Valery Pykhtin Date: Thu, 6 Nov 2025 14:02:24 +0000 Subject: [PATCH] [AMDGPU] Use fmac_f64 in "if (cond) a -= c" --- llvm/lib/Target/AMDGPU/AMDGPUCombine.td | 16 +- .../AMDGPU/AMDGPUPostLegalizerCombiner.cpp | 134 +- .../Target/AMDGPU/GCNPreRAOptimizations.cpp | 213 +++- llvm/lib/Target/AMDGPU/GCNSubtarget.h | 6 + llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 126 +- .../CodeGen/AMDGPU/GlobalISel/fma-cond-sub.ll | 1087 +++++++++++++++++ llvm/test/CodeGen/AMDGPU/fma-cond-sub.ll | 1083 ++++++++++++++++ .../CodeGen/AMDGPU/revert-fma-cond-sub.mir | 236 ++++ 8 files changed, 2887 insertions(+), 14 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/fma-cond-sub.ll create mode 100644 llvm/test/CodeGen/AMDGPU/fma-cond-sub.ll create mode 100644 llvm/test/CodeGen/AMDGPU/revert-fma-cond-sub.mir diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td index 7f00eadbf3f3f..7a3a78b04516f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -130,6 +130,20 @@ def sign_extension_in_reg : GICombineRule< [{ return matchCombineSignExtendInReg(*${sign_inreg}, ${matchinfo}); }]), (apply [{ applyCombineSignExtendInReg(*${sign_inreg}, ${matchinfo}); }])>; +def cond_sub_to_fma_matchdata : GIDefMatchData<"ConditionalSubToFMAMatchInfo">; + +// Optimize conditional subtraction patterns to FMA: +// result = a - (cond ? c : 0.0) -> fma(select(cond, -1.0, 0.0), c, a) +// result = a + (cond ? -c : 0.0) -> fma(select(cond, -1.0, 0.0), c, a) +// result = a + (-(cond ? c : 0.0)) -> fma(select(cond, -1.0, 0.0), c, a) +// +// Only enabled for f64 when hasFmacF64Inst() is true. +def cond_sub_to_fma : GICombineRule< + (defs root:$fsub_or_fadd, cond_sub_to_fma_matchdata:$matchinfo), + (match (wip_match_opcode G_FSUB, G_FADD):$fsub_or_fadd, + [{ return matchConditionalSubToFMA(*${fsub_or_fadd}, ${matchinfo}); }]), + (apply [{ applyConditionalSubToFMA(*${fsub_or_fadd}, ${matchinfo}); }])>; + // Do the following combines : // fmul x, select(y, A, B) -> fldexp (x, select i32 (y, a, b)) // fmul x, select(y, -A, -B) -> fldexp ((fneg x), select i32 (y, a, b)) @@ -228,7 +242,7 @@ def AMDGPUPostLegalizerCombiner: GICombiner< [all_combines, gfx6gfx7_combines, gfx8_combines, combine_fmul_with_select_to_fldexp, uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize, foldable_fneg, rcp_sqrt_to_rsq, fdiv_by_sqrt_to_rsq_f16, sign_extension_in_reg, smulu64, - binop_s64_with_s32_mask_combines, combine_or_s64_s32]> { + cond_sub_to_fma, binop_s64_with_s32_mask_combines, combine_or_s64_s32]> { let CombineAllMethodName = "tryCombineAllImpl"; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp index e86b4738bed18..90e4c42c9b73b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp @@ -24,6 +24,7 @@ #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/Target/TargetMachine.h" @@ -47,6 +48,7 @@ class AMDGPUPostLegalizerCombinerImpl : public Combiner { const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig; const GCNSubtarget &STI; const SIInstrInfo &TII; + const MachineLoopInfo *MLI; // TODO: 
Make CombinerHelper methods const. mutable AMDGPUCombinerHelper Helper; @@ -56,7 +58,7 @@ class AMDGPUPostLegalizerCombinerImpl : public Combiner { GISelValueTracking &VT, GISelCSEInfo *CSEInfo, const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig, const GCNSubtarget &STI, MachineDominatorTree *MDT, - const LegalizerInfo *LI); + const MachineLoopInfo *MLI, const LegalizerInfo *LI); static const char *getName() { return "AMDGPUPostLegalizerCombinerImpl"; } @@ -113,6 +115,18 @@ class AMDGPUPostLegalizerCombinerImpl : public Combiner { // bits are zero extended. bool matchCombine_s_mul_u64(MachineInstr &MI, unsigned &NewOpcode) const; + // Match conditional subtraction patterns for FMA optimization + struct ConditionalSubToFMAMatchInfo { + Register Cond; + Register C; + Register A; + }; + + bool matchConditionalSubToFMA(MachineInstr &MI, + ConditionalSubToFMAMatchInfo &MatchInfo) const; + void applyConditionalSubToFMA(MachineInstr &MI, + const ConditionalSubToFMAMatchInfo &MatchInfo) const; + private: #define GET_GICOMBINER_CLASS_MEMBERS #define AMDGPUSubtarget GCNSubtarget @@ -131,9 +145,10 @@ AMDGPUPostLegalizerCombinerImpl::AMDGPUPostLegalizerCombinerImpl( MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC, GISelValueTracking &VT, GISelCSEInfo *CSEInfo, const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig, - const GCNSubtarget &STI, MachineDominatorTree *MDT, const LegalizerInfo *LI) + const GCNSubtarget &STI, MachineDominatorTree *MDT, + const MachineLoopInfo *MLI, const LegalizerInfo *LI) : Combiner(MF, CInfo, TPC, &VT, CSEInfo), RuleConfig(RuleConfig), STI(STI), - TII(*STI.getInstrInfo()), + TII(*STI.getInstrInfo()), MLI(MLI), Helper(Observer, B, /*IsPreLegalize*/ false, &VT, MDT, LI, STI), #define GET_GICOMBINER_CONSTRUCTOR_INITS #include "AMDGPUGenPostLegalizeGICombiner.inc" @@ -435,6 +450,112 @@ bool AMDGPUPostLegalizerCombinerImpl::matchCombine_s_mul_u64( return false; } +// Match conditional subtraction patterns for FMA optimization. +// +// This function identifies patterns like: +// result = a - (cond ? c : 0.0) +// result = a + (cond ? -c : 0.0) +// result = a + (-(cond ? c : 0.0)) +// +// These can be converted to an efficient FMA: +// result = fma((cond ? -1.0, 0.0), c, a) +// +bool AMDGPUPostLegalizerCombinerImpl::matchConditionalSubToFMA( + MachineInstr &MI, ConditionalSubToFMAMatchInfo &MatchInfo) const { + // Only optimize f64 with FMAC support, and check VOPD constraints. + if (!MLI || !STI.shouldUseConditionalSubToFMAF64()) + return false; + + Register DstReg = MI.getOperand(0).getReg(); + LLT Ty = MRI.getType(DstReg); + if (Ty != LLT::scalar(64)) + return false; + + Register A = MI.getOperand(1).getReg(); + Register RHS = MI.getOperand(2).getReg(); + MachineInstr *RHSMI = MRI.getVRegDef(RHS); + if (!RHSMI) + return false; + + // Returns true if SelMI is a valid select with false value = 0.0. + auto matchSelectWithZero = [this, &MI](MachineInstr *SelMI, Register &Cond, + Register &TrueVal) -> bool { + if (!SelMI || SelMI->getOpcode() != TargetOpcode::G_SELECT) + return false; + + // Check if FalseVal is exactly 0.0. + Register FalseVal = SelMI->getOperand(3).getReg(); + auto FalseConst = getFConstantVRegValWithLookThrough(FalseVal, MRI); + if (!FalseConst || !FalseConst->Value.isExactlyValue(0.0)) + return false; + + // Check if TrueVal is not constant. 
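+    // (Constant true values are deliberately not handled; the
+    // cond_sub_constant test covers this case.)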
+ auto TempTrueVal = SelMI->getOperand(2).getReg(); + auto TrueConst = getAnyConstantVRegValWithLookThrough(TempTrueVal, MRI); + if (TrueConst) + return false; + + // Check if select and the add/sub are in same loop context. + if (MLI->getLoopFor(MI.getParent()) != MLI->getLoopFor(SelMI->getParent())) + return false; + + TrueVal = TempTrueVal; + Cond = SelMI->getOperand(1).getReg(); + return true; + }; + + Register Cond, C; + if (MI.getOpcode() == TargetOpcode::G_FSUB) { + // Pattern: fsub a, (select cond, c, 0.0) + if (matchSelectWithZero(RHSMI, Cond, C)) { + MatchInfo = {Cond, C, A}; + return true; + } + } else if (MI.getOpcode() == TargetOpcode::G_FADD) { + // Pattern 1: fadd a, (fneg (select cond, c, 0.0)) + if (RHSMI->getOpcode() == TargetOpcode::G_FNEG) { + Register SelReg = RHSMI->getOperand(1).getReg(); + MachineInstr *SelMI = MRI.getVRegDef(SelReg); + if (matchSelectWithZero(SelMI, Cond, C)) { + MatchInfo = {Cond, C, A}; + return true; + } + } + + // Pattern 2: fadd a, (select cond, (fneg c), 0.0) + if (matchSelectWithZero(RHSMI, Cond, C)) { + // Check if C is fneg + MachineInstr *CMI = MRI.getVRegDef(C); + if (CMI && CMI->getOpcode() == TargetOpcode::G_FNEG) { + C = CMI->getOperand(1).getReg(); + MatchInfo = {Cond, C, A}; + return true; + } + } + } + return false; +} + +void AMDGPUPostLegalizerCombinerImpl::applyConditionalSubToFMA( + MachineInstr &MI, const ConditionalSubToFMAMatchInfo &MatchInfo) const { + Register Dst = MI.getOperand(0).getReg(); + LLT Ty = MRI.getType(Dst); + + // Build: correction = select cond, -1.0, 0.0 + APFloat MinusOne = APFloat(-1.0); + APFloat Zero = APFloat(0.0); + + Register MinusOneReg = B.buildFConstant(Ty, MinusOne).getReg(0); + Register ZeroReg = B.buildFConstant(Ty, Zero).getReg(0); + Register Correction = + B.buildSelect(Ty, MatchInfo.Cond, MinusOneReg, ZeroReg).getReg(0); + + // Build: result = fma(correction, c, a) + B.buildFMA(Dst, Correction, MatchInfo.C, MatchInfo.A, MI.getFlags()); + + MI.eraseFromParent(); +} + // Pass boilerplate // ================ @@ -467,6 +588,8 @@ void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { if (!IsOptNone) { AU.addRequired(); AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); } MachineFunctionPass::getAnalysisUsage(AU); } @@ -494,6 +617,8 @@ bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { MachineDominatorTree *MDT = IsOptNone ? nullptr : &getAnalysis().getDomTree(); + MachineLoopInfo *MLI = + IsOptNone ? nullptr : &getAnalysis().getLI(); CombinerInfo CInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true, LI, EnableOpt, F.hasOptSize(), F.hasMinSize()); @@ -503,7 +628,7 @@ bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { // Legalizer performs DCE, so a full DCE pass is unnecessary. 
CInfo.EnableFullDCE = false; AMDGPUPostLegalizerCombinerImpl Impl(MF, CInfo, TPC, *VT, /*CSEInfo*/ nullptr, - RuleConfig, ST, MDT, LI); + RuleConfig, ST, MDT, MLI, LI); return Impl.combineMachineInstrs(); } @@ -513,6 +638,7 @@ INITIALIZE_PASS_BEGIN(AMDGPUPostLegalizerCombiner, DEBUG_TYPE, false) INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) INITIALIZE_PASS_DEPENDENCY(GISelValueTrackingAnalysisLegacy) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass) INITIALIZE_PASS_END(AMDGPUPostLegalizerCombiner, DEBUG_TYPE, "Combine AMDGPU machine instrs after legalization", false, false) diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp index 62172a0bb89db..aaebd918552a2 100644 --- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp +++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp @@ -36,6 +36,7 @@ #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIRegisterInfo.h" #include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/InitializePasses.h" @@ -45,17 +46,33 @@ using namespace llvm; namespace { +static bool isImmConstant(const MachineOperand &Op, int64_t Val) { + return Op.isImm() && Op.getImm() == Val; +} + class GCNPreRAOptimizationsImpl { private: const SIInstrInfo *TII; const SIRegisterInfo *TRI; MachineRegisterInfo *MRI; LiveIntervals *LIS; + MachineLoopInfo *MLI; bool processReg(Register Reg); + bool isSingleUseVReg(Register Reg) const { + return Reg.isVirtual() && MRI->hasOneUse(Reg); + } + + bool isConstMove(MachineInstr &MI, int64_t C) const { + return TII->isFoldableCopy(MI) && isImmConstant(MI.getOperand(1), C); + } + + bool revertConditionalFMAPattern(MachineInstr &FMAInstr); + public: - GCNPreRAOptimizationsImpl(LiveIntervals *LS) : LIS(LS) {} + GCNPreRAOptimizationsImpl(LiveIntervals *LS, MachineLoopInfo *MLI) + : LIS(LS), MLI(MLI) {} bool run(MachineFunction &MF); }; @@ -75,6 +92,7 @@ class GCNPreRAOptimizationsLegacy : public MachineFunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); + AU.addRequired(); AU.setPreservesAll(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -84,6 +102,7 @@ class GCNPreRAOptimizationsLegacy : public MachineFunctionPass { INITIALIZE_PASS_BEGIN(GCNPreRAOptimizationsLegacy, DEBUG_TYPE, "AMDGPU Pre-RA optimizations", false, false) INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass) INITIALIZE_PASS_END(GCNPreRAOptimizationsLegacy, DEBUG_TYPE, "Pre-RA optimizations", false, false) @@ -229,14 +248,17 @@ bool GCNPreRAOptimizationsLegacy::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(MF.getFunction())) return false; LiveIntervals *LIS = &getAnalysis().getLIS(); - return GCNPreRAOptimizationsImpl(LIS).run(MF); + MachineLoopInfo *MLI = + &getAnalysis().getLI(); + return GCNPreRAOptimizationsImpl(LIS, MLI).run(MF); } PreservedAnalyses GCNPreRAOptimizationsPass::run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM) { LiveIntervals *LIS = &MFAM.getResult(MF); - GCNPreRAOptimizationsImpl(LIS).run(MF); + MachineLoopInfo *MLI = &MFAM.getResult(MF); + GCNPreRAOptimizationsImpl(LIS, MLI).run(MF); return PreservedAnalyses::all(); } @@ -260,6 +282,15 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) { Changed |= processReg(Reg); } + if (ST.shouldUseConditionalSubToFMAF64()) { + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : make_early_inc_range(MBB)) { + if (MI.getOpcode() 
== AMDGPU::V_FMAC_F64_e32) + Changed |= revertConditionalFMAPattern(MI); + } + } + } + if (!ST.useRealTrue16Insts()) return Changed; @@ -295,3 +326,179 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) { return Changed; } + +/// Revert conditional subtraction to conditional FMA optimization happened +/// earlier in the selector. The reason is that the optimization uses more +/// instructions and registers to hold constants than original pattern and after +/// rematerializer it becomes clear if those constants are shared with other +/// code. +/// +/// Detects a pattern where an FMA is used to conditionally subtract a value: +/// FMA(dst, cond ? -1.0 : 0.0, value, accum) -> accum - (cond ? value : 0) +/// +/// Pattern detected: +/// v_mov_b32_e32 vNegOneHi, 0xbff00000 ; -1.0 high bits (single use) +/// v_mov_b32_e32 vMul.lo, 0 ; (single use) +/// v_cndmask_b32_e64 vMul.hi, 0, vNegOneHi, vCondReg ; (single use) +/// v_fmac_f64_e32 vDst[0:1], vMul[0:1], vValue[0:1], vAccum[0:1] +/// +/// Transformed to (3 instructions instead of 4, lower register pressure): +/// v_cndmask_b32_e64 vCondValue.lo, 0, vValue.lo, vCondReg +/// v_cndmask_b32_e64 vCondValue.hi, 0, vValue.hi, vCondReg +/// v_add_f64_e64 vDst[0:1], vAccum[0:1], -vCondValue[0:1] +/// +/// For loops: if both constants are initialized before the loop where the +/// v_fmac resides, we keep the original pattern. Ignoring case when v_fmac and +/// v_cndmask aren't in the same loop context as the selector doesn't generate +/// the pattern if v_cndmask is loop invariant. +bool GCNPreRAOptimizationsImpl::revertConditionalFMAPattern( + MachineInstr &FMAInstr) { + assert(FMAInstr.getOpcode() == AMDGPU::V_FMAC_F64_e32); + + MachineOperand *MulOp = + TII->getNamedOperand(FMAInstr, AMDGPU::OpName::src0); + assert(MulOp); + if (!MulOp->isReg() || !isSingleUseVReg(MulOp->getReg())) + return false; + + // Find subregister definitions for the 64-bit multiplicand register + MachineInstr *MulLoDefMI = nullptr; + MachineInstr *MulHiDefMI = nullptr; + + for (auto &DefMI : MRI->def_instructions(MulOp->getReg())) { + if (DefMI.getOperand(0).getSubReg() == AMDGPU::sub0) { + MulLoDefMI = &DefMI; + } else if (DefMI.getOperand(0).getSubReg() == AMDGPU::sub1) { + MulHiDefMI = &DefMI; + } + } + + if (!MulLoDefMI || !isConstMove(*MulLoDefMI, 0)) + return false; + + if (!MulHiDefMI || MulHiDefMI->getOpcode() != AMDGPU::V_CNDMASK_B32_e64) + return false; + + MachineInstr *CndMaskMI = MulHiDefMI; + MachineOperand *CndMaskFalseOp = + TII->getNamedOperand(*CndMaskMI, AMDGPU::OpName::src0); + assert(CndMaskFalseOp); + if (!isImmConstant(*CndMaskFalseOp, 0)) + return false; + + MachineOperand *CndMaskTrueOp = + TII->getNamedOperand(*CndMaskMI, AMDGPU::OpName::src1); + assert(CndMaskTrueOp); + if (!isSingleUseVReg(CndMaskTrueOp->getReg())) + return false; + + // Check that the true operand is -1.0's high 32 bits (0xbff00000) + MachineOperand *NegOneHiDef = MRI->getOneDef(CndMaskTrueOp->getReg()); + if (!NegOneHiDef || + !isConstMove(*NegOneHiDef->getParent(), -1074790400 /* 0xbff00000 */)) + return false; + + MachineInstr *NegOneHiMovMI = NegOneHiDef->getParent(); + + if (MachineLoop *L = MLI->getLoopFor(FMAInstr.getParent())) { + // The selector skips optimization if 'select' is loop invariant, so this is + // more like an assert. + if (MLI->getLoopFor(CndMaskMI->getParent()) != L) + return false; + + // If both constants are initialized before the loop it's still beneficial + // to keep the pattern. 
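+    // Returning false here keeps the v_fmac form: with both constant movs
+    // hoisted out of the loop it needs fewer in-loop instructions than the
+    // reverted v_cndmask/v_cndmask/v_add_f64 sequence.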
+ if (MLI->getLoopFor(NegOneHiMovMI->getParent()) != L && + MLI->getLoopFor(MulLoDefMI->getParent()) != L) + return false; + } + + // Perform the revert + auto *DstOpnd = TII->getNamedOperand(FMAInstr, AMDGPU::OpName::vdst); + auto *ValueOpnd = TII->getNamedOperand(FMAInstr, AMDGPU::OpName::src1); + auto *AccumOpnd = TII->getNamedOperand(FMAInstr, AMDGPU::OpName::src2); + auto *CondOpnd = TII->getNamedOperand(*CndMaskMI, AMDGPU::OpName::src2); + assert(DstOpnd && ValueOpnd && AccumOpnd && CondOpnd); + + Register DstReg = DstOpnd->getReg(); + Register ValueReg = ValueOpnd->getReg(); + Register AccumReg = AccumOpnd->getReg(); + Register CondReg = CondOpnd->getReg(); + + // Create a new 64-bit register for the conditional value + Register CondValueReg = + MRI->createVirtualRegister(MRI->getRegClass(ValueReg)); + + MachineBasicBlock::iterator InsertPt = FMAInstr.getIterator(); + DebugLoc DL = FMAInstr.getDebugLoc(); + + // Build: vCondValue.lo = condition ? vValue.lo : 0 + MachineBasicBlock *MBB = FMAInstr.getParent(); + MachineInstr *SelLo = + BuildMI(*MBB, InsertPt, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64)) + .addReg(CondValueReg, RegState::DefineNoRead, AMDGPU::sub0) + .addImm(0) // src0_modifiers + .addImm(0) // src0 (false value = 0) + .addImm(0) // src1_modifiers + .addReg(ValueReg, 0, AMDGPU::sub0) // src1 (true value = vValue.lo) + .addReg(CondReg) // condition + .getInstr(); + + // Build: vCondValue.hi = condition ? vValue.hi : 0 + MachineInstr *SelHi = + BuildMI(*MBB, InsertPt, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64)) + .addReg(CondValueReg, RegState::Define, AMDGPU::sub1) + .addImm(0) // src0_modifiers + .addImm(0) // src0 (false value = 0) + .addImm(0) // src1_modifiers + .addReg(ValueReg, 0, AMDGPU::sub1) // src1 (true value = vValue.hi) + .addReg(CondReg) // condition + .getInstr(); + + // Build: vDst = vAccum - vCondValue (negation via src1_modifiers bit) + MachineInstr *Sub = + BuildMI(*MBB, InsertPt, DL, TII->get(AMDGPU::V_ADD_F64_e64)) + .addReg(DstReg, RegState::Define) + .addImm(0) // src0_modifiers + .addReg(AccumReg) // src0 (accumulator) + .addImm(1) // src1_modifiers (negation bit) + .addReg(CondValueReg) // src1 (negated conditional value) + .addImm(0) // clamp + .addImm(0) // omod + .getInstr(); + + // Delete the old instructions + for (MachineInstr *MI : {&FMAInstr, MulLoDefMI, CndMaskMI, NegOneHiMovMI}) { + LIS->RemoveMachineInstrFromMaps(*MI); + MI->eraseFromParent(); + } + + LIS->InsertMachineInstrInMaps(*SelLo); + LIS->InsertMachineInstrInMaps(*SelHi); + LIS->InsertMachineInstrInMaps(*Sub); + + // Removed registers. + LIS->removeInterval(MulOp->getReg()); + LIS->removeInterval(CndMaskTrueOp->getReg()); + + // Reused registers. + LIS->removeInterval(CondReg); + LIS->createAndComputeVirtRegInterval(CondReg); + + LIS->removeInterval(DstReg); + LIS->createAndComputeVirtRegInterval(DstReg); + + // Update AccumReg if it's different from DstReg. + if (AccumReg != DstReg) { + LIS->removeInterval(AccumReg); + LIS->createAndComputeVirtRegInterval(AccumReg); + } + + LIS->removeInterval(ValueReg); + LIS->createAndComputeVirtRegInterval(ValueReg); + + // New register. 
+ LIS->createAndComputeVirtRegInterval(CondValueReg); + + return true; +} diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index f377b8aaf1333..24ee82cad0509 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -830,6 +830,12 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool hasFmacF64Inst() const { return HasFmacF64Inst; } + /// Returns true if we should use conditional subtraction to FMA optimization. + /// This optimization has no benefit with dual issue instructions available. + bool shouldUseConditionalSubToFMAF64() const { + return HasFmacF64Inst && !HasVOPDInsts; + } + bool hasDot1Insts() const { return HasDot1Insts; } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 2c689cc8bcbef..69be951751fde 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -16295,11 +16295,24 @@ SITargetLowering::performAddCarrySubCarryCombine(SDNode *N, return SDValue(); } -SDValue SITargetLowering::performFAddCombine(SDNode *N, - DAGCombinerInfo &DCI) const { - if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) +/// This function converts patterns like: +/// %correction = select %cond, -1.0, 0.0 +/// %result = a + correction * c +/// into: +/// %result = fma(correction, c, a) +static SDValue createConditionalSubToFMA(SelectionDAG &DAG, SDLoc SL, EVT VT, + SDValue Cond, SDValue C, SDValue A) { + // Constants processed differently. + if (isa(C) || isa(C)) return SDValue(); + const SDValue MinusOne = DAG.getConstantFP(-1.0, SL, VT); + const SDValue Zero = DAG.getConstantFP(0.0, SL, VT); + SDValue Correction = DAG.getNode(ISD::SELECT, SL, VT, Cond, MinusOne, Zero); + return DAG.getNode(ISD::FMA, SL, VT, Correction, C, A); +} +SDValue SITargetLowering::performFAddCombine(SDNode *N, + DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); @@ -16307,6 +16320,66 @@ SDValue SITargetLowering::performFAddCombine(SDNode *N, SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); + // Optimize conditional subtraction patterns that have been transformed to + // fadd during IR canonicalization. These patterns arise from code like: + // result = a - (cond ? c : 0.0) + // which is often rewritten by early optimizations as: + // result = a + (cond ? -c : 0.0) + // + // The optimization converts these patterns to use hardware FMA instructions: + // correction = select %cond, -1.0, 0.0 + // result = fma(correction, c, a) + // + // This is beneficial because: + // 1. FMA is a single instruction vs multiple (select + fadd) + // 2. FMA has better precision than separate operations + // 3. Avoids the overhead of loading the conditional value 'c' + // 4. Works efficiently on AMDGPU which has native FMA support + // + // Only enabled for f64 when hardware FMA is available (FMAC instruction). + // Also check that VOPD is not enabled or we're in wave64 mode, as VOPD + // dual-issue (wave32 only) might be more beneficial than FMA. 
+ if (Subtarget->shouldUseConditionalSubToFMAF64() && + VT == MVT::f64) { // Only optimize if FMA is available + // Pattern 1: fadd %a, (fneg (select %cond, %c, 0.0)) + if (RHS.getOpcode() == ISD::FNEG && + RHS.getOperand(0).getOpcode() == ISD::SELECT) { + SDValue SelNode = RHS.getOperand(0); + SDValue Cond = SelNode.getOperand(0); // condition + SDValue TrueVal = SelNode.getOperand(1); // c + SDValue FalseVal = SelNode.getOperand(2); // should be 0.0 + + // Verify the false branch is exactly 0.0 (not -0.0 or NaN) + if (ConstantFPSDNode *FPConst = dyn_cast(FalseVal)) { + if (FPConst->isExactlyValue(0.0)) { + if (SDValue Result = + createConditionalSubToFMA(DAG, SL, VT, Cond, TrueVal, LHS)) + return Result; + } + } + } + + // Pattern 2: fadd %a, (select %cond, (fneg %c), 0.0) + if (RHS.getOpcode() == ISD::SELECT) { + SDValue Cond = RHS.getOperand(0); // condition + SDValue TrueVal = RHS.getOperand(1); // should be fneg %c + SDValue FalseVal = RHS.getOperand(2); // should be 0.0 + + // Verify the false branch is exactly 0.0, and true branch is negated value + if (ConstantFPSDNode *FPConst = dyn_cast(FalseVal)) { + if (FPConst->isExactlyValue(0.0) && TrueVal.getOpcode() == ISD::FNEG) { + SDValue ActualC = TrueVal.getOperand(0); + if (SDValue Result = + createConditionalSubToFMA(DAG, SL, VT, Cond, ActualC, LHS)) + return Result; + } + } + } + } + + if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) + return SDValue(); + // These should really be instruction patterns, but writing patterns with // source modifiers is a pain. @@ -16339,12 +16412,53 @@ SDValue SITargetLowering::performFAddCombine(SDNode *N, SDValue SITargetLowering::performFSubCombine(SDNode *N, DAGCombinerInfo &DCI) const { - if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) - return SDValue(); - SelectionDAG &DAG = DCI.DAG; SDLoc SL(N); EVT VT = N->getValueType(0); + + // Optimize conditional subtraction pattern directly from fsub: + // result = a - (cond ? c : 0.0) + // + // This pattern commonly appears in source code and should be converted to: + // correction = select %cond, -1.0, 0.0 + // result = fma(correction, c, a) + // + // This is the most direct form of the conditional subtraction pattern, + // before any IR canonicalization transforms it to an fadd form. + // + // Benefits of the FMA transformation: + // - Single FMA instruction instead of select + fsub + // - Better numerical precision (no intermediate rounding) + // - More efficient on AMDGPU hardware with native FMA support + // + // Only enabled for f64 when hardware FMA is available (FMAC instruction). + // Also check that VOPD is not enabled or we're in wave64 mode, as VOPD + // dual-issue (wave32 only) might be more beneficial than FMA. 
+ if (VT == MVT::f64 && + Subtarget->shouldUseConditionalSubToFMAF64()) { // Only optimize if FMA is available + SDValue LHS = N->getOperand(0); // a + SDValue RHS = N->getOperand(1); // sel + + // Match the pattern: fsub %a, (select %cond, %c, 0.0) + if (RHS.getOpcode() == ISD::SELECT) { + SDValue Cond = RHS.getOperand(0); // condition + SDValue TrueVal = RHS.getOperand(1); // c + SDValue FalseVal = RHS.getOperand(2); // should be 0.0 + + // Verify false branch is exactly zero (handles IEEE-correct zero check) + if (ConstantFPSDNode *Zero = dyn_cast(FalseVal)) { + if (Zero->isZero()) { + if (SDValue Result = + createConditionalSubToFMA(DAG, SL, VT, Cond, TrueVal, LHS)) + return Result; + } + } + } + } + + if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) + return SDValue(); + assert(!VT.isVector()); // Try to get the fneg to fold into the source modifier. This undoes generic diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fma-cond-sub.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma-cond-sub.ll new file mode 100644 index 0000000000000..90af1a382449e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma-cond-sub.ll @@ -0,0 +1,1087 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -global-isel -mcpu=gfx1250 -O2 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX1250 %s +; RUN: llc -mtriple=amdgcn -global-isel -mcpu=gfx90a -O2 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX90A %s +; RUN: llc -mtriple=amdgcn -global-isel -mcpu=gfx90a -O2 -stop-before=amdgpu-pre-ra-optimizations -verify-machineinstrs < %s | FileCheck -check-prefix=GFX90A-MIR %s + + +; Test conditional subtraction optimization to FMA on gfx9 with FMA. +; The optimization doesn't make sence on gfx12 due to dual-issued v_cndmask. +; +; Patterns optimized: +; 1. Direct fsub: result = a - (cond ? c : 0.0) +; 2. Canonicalized: result = a + (cond ? -c : 0.0) +; 3. Negated select: result = a + (-(cond ? c : 0.0)) +; +; These are converted to: +; result = fma((cond ? -1.0, 0.0), c, a) +; +; This saves one v_cndmask per pattern which provides the most benefit in loops. +; +; As the optimization may be reverted by amdgpu-pre-ra-optimizations pass +; GFX90A-MIR checks show MIR on the entry to this pass. + +; ============================================================================ +; Basic patterns - single basic block +; ============================================================================ + +; Pattern 1: Direct fsub with select +; result = a - (cond ? 
c : 0.0) +define double @cond_sub_basic(double %a, double %c, i1 %cond) { +; GFX1250-LABEL: cond_sub_basic: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX1250-NEXT: v_dual_cndmask_b32 v2, 0, v2 :: v_dual_cndmask_b32 v3, 0, v3 +; GFX1250-NEXT: v_add_f64_e64 v[0:1], v[0:1], -v[2:3] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX90A-LABEL: cond_sub_basic: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GFX90A-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-MIR-LABEL: name: cond_sub_basic +; GFX90A-MIR: bb.0 (%ir-block.0): +; GFX90A-MIR-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 +; GFX90A-MIR-NEXT: {{ $}} +; GFX90A-MIR-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr0 +; GFX90A-MIR-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr1 +; GFX90A-MIR-NEXT: undef [[COPY1:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr2 +; GFX90A-MIR-NEXT: [[COPY1:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr3 +; GFX90A-MIR-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr4 +; GFX90A-MIR-NEXT: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY2]], implicit $exec +; GFX90A-MIR-NEXT: undef [[AV_MOV_:%[0-9]+]].sub0:vreg_64_align2 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec +; GFX90A-MIR-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1074790400, implicit $exec +; GFX90A-MIR-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec +; GFX90A-MIR-NEXT: [[AV_MOV_:%[0-9]+]].sub1:vreg_64_align2 = V_CNDMASK_B32_e64 0, 0, 0, [[V_MOV_B32_e32_]], [[V_CMP_NE_U32_e64_]], implicit $exec +; GFX90A-MIR-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = nofpexcept V_FMAC_F64_e32 [[AV_MOV_]], [[COPY1]], [[COPY]], implicit $mode, implicit $exec +; GFX90A-MIR-NEXT: $vgpr0 = COPY [[COPY]].sub0 +; GFX90A-MIR-NEXT: $vgpr1 = COPY [[COPY]].sub1 +; GFX90A-MIR-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1 +; Expected conversion to FMA and revert + %sel = select i1 %cond, double %c, double 0.0 + %result = fsub double %a, %sel + ret double %result +} + +; Pattern 2: fadd with negated select (canonicalized form) +; result = a + (cond ? 
-c : 0.0) +define double @cond_sub_fadd_neg_select(double %a, double %c, i1 %cond) { +; GFX1250-LABEL: cond_sub_fadd_neg_select: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX1250-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX1250-NEXT: v_dual_cndmask_b32 v2, 0, v2 :: v_dual_cndmask_b32 v3, 0, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_add_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX90A-LABEL: cond_sub_fadd_neg_select: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GFX90A-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-MIR-LABEL: name: cond_sub_fadd_neg_select +; GFX90A-MIR: bb.0 (%ir-block.0): +; GFX90A-MIR-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 +; GFX90A-MIR-NEXT: {{ $}} +; GFX90A-MIR-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr0 +; GFX90A-MIR-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr1 +; GFX90A-MIR-NEXT: undef [[COPY1:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr2 +; GFX90A-MIR-NEXT: [[COPY1:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr3 +; GFX90A-MIR-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr4 +; GFX90A-MIR-NEXT: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY2]], implicit $exec +; GFX90A-MIR-NEXT: undef [[AV_MOV_:%[0-9]+]].sub0:vreg_64_align2 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec +; GFX90A-MIR-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1074790400, implicit $exec +; GFX90A-MIR-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec +; GFX90A-MIR-NEXT: [[AV_MOV_:%[0-9]+]].sub1:vreg_64_align2 = V_CNDMASK_B32_e64 0, 0, 0, [[V_MOV_B32_e32_]], [[V_CMP_NE_U32_e64_]], implicit $exec +; GFX90A-MIR-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = nofpexcept V_FMAC_F64_e32 [[AV_MOV_]], [[COPY1]], [[COPY]], implicit $mode, implicit $exec +; GFX90A-MIR-NEXT: $vgpr0 = COPY [[COPY]].sub0 +; GFX90A-MIR-NEXT: $vgpr1 = COPY [[COPY]].sub1 +; GFX90A-MIR-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1 +; Expected conversion to FMA and revert + %neg_c = fneg double %c + %sel = select i1 %cond, double %neg_c, double 0.0 + %result = fadd double %a, %sel + ret double %result +} + +; Pattern 3: fadd with fneg of select +; result = a + (-(cond ? 
c : 0.0)) +define double @cond_sub_fadd_fneg(double %a, double %c, i1 %cond) { +; GFX1250-LABEL: cond_sub_fadd_fneg: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX1250-NEXT: v_dual_cndmask_b32 v2, 0, v2 :: v_dual_cndmask_b32 v3, 0, v3 +; GFX1250-NEXT: v_add_f64_e64 v[0:1], v[0:1], -v[2:3] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX90A-LABEL: cond_sub_fadd_fneg: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GFX90A-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-MIR-LABEL: name: cond_sub_fadd_fneg +; GFX90A-MIR: bb.0 (%ir-block.0): +; GFX90A-MIR-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 +; GFX90A-MIR-NEXT: {{ $}} +; GFX90A-MIR-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr0 +; GFX90A-MIR-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr1 +; GFX90A-MIR-NEXT: undef [[COPY1:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr2 +; GFX90A-MIR-NEXT: [[COPY1:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr3 +; GFX90A-MIR-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr4 +; GFX90A-MIR-NEXT: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY2]], implicit $exec +; GFX90A-MIR-NEXT: undef [[AV_MOV_:%[0-9]+]].sub0:vreg_64_align2 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec +; GFX90A-MIR-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1074790400, implicit $exec +; GFX90A-MIR-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec +; GFX90A-MIR-NEXT: [[AV_MOV_:%[0-9]+]].sub1:vreg_64_align2 = V_CNDMASK_B32_e64 0, 0, 0, [[V_MOV_B32_e32_]], [[V_CMP_NE_U32_e64_]], implicit $exec +; GFX90A-MIR-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = nofpexcept V_FMAC_F64_e32 [[AV_MOV_]], [[COPY1]], [[COPY]], implicit $mode, implicit $exec +; GFX90A-MIR-NEXT: $vgpr0 = COPY [[COPY]].sub0 +; GFX90A-MIR-NEXT: $vgpr1 = COPY [[COPY]].sub1 +; GFX90A-MIR-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1 +; Expected conversion to FMA and revert + %sel = select i1 %cond, double %c, double 0.0 + %neg_sel = fneg double %sel + %result = fadd double %a, %neg_sel + ret double %result +} + +; Test with constant value +define double @cond_sub_constant(double %a, i1 %cond) { +; GFX1250-LABEL: cond_sub_constant: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_bitop2_b32 v3, 1, v2 bitop3:0x40 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3 +; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 0x40140000, vcc_lo +; GFX1250-NEXT: v_add_f64_e64 v[0:1], v[0:1], -v[2:3] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX90A-LABEL: cond_sub_constant: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_and_b32_e32 v3, 1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0x40140000 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc +; GFX90A-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] +; GFX90A-NEXT: s_setpc_b64 s[30:31] 
+; GFX90A-MIR-LABEL: name: cond_sub_constant +; GFX90A-MIR: bb.0 (%ir-block.0): +; GFX90A-MIR-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 +; GFX90A-MIR-NEXT: {{ $}} +; GFX90A-MIR-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr0 +; GFX90A-MIR-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr1 +; GFX90A-MIR-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 +; GFX90A-MIR-NEXT: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY1]], implicit $exec +; GFX90A-MIR-NEXT: undef [[AV_MOV_:%[0-9]+]].sub0:vreg_64_align2 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec +; GFX90A-MIR-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1075052544, implicit $exec +; GFX90A-MIR-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec +; GFX90A-MIR-NEXT: [[AV_MOV_:%[0-9]+]].sub1:vreg_64_align2 = V_CNDMASK_B32_e64 0, 0, 0, [[V_MOV_B32_e32_]], [[V_CMP_NE_U32_e64_]], implicit $exec +; GFX90A-MIR-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[COPY]], 1, [[AV_MOV_]], 0, 0, implicit $mode, implicit $exec +; GFX90A-MIR-NEXT: $vgpr0 = COPY [[V_ADD_F64_e64_]].sub0 +; GFX90A-MIR-NEXT: $vgpr1 = COPY [[V_ADD_F64_e64_]].sub1 +; GFX90A-MIR-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1 +; Constants aren't expected. + %sel = select i1 %cond, double 5.0, double 0.0 + %result = fsub double %a, %sel + ret double %result +} + +; ============================================================================ +; Multiple patterns in single basic block +; ============================================================================ + +; Two independent conditional subtractions +define double @two_cond_sub(double %a, double %b, double %c1, double %c2, i1 %cond1, i1 %cond2) { +; GFX1250-LABEL: two_cond_sub: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v2, 1, v8 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX1250-NEXT: v_dual_cndmask_b32 v2, 0, v4 :: v_dual_cndmask_b32 v3, 0, v5 +; GFX1250-NEXT: v_dual_add_f64 v[0:1], v[0:1], -v[2:3] :: v_dual_bitop2_b32 v2, 1, v9 bitop3:0x40 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX1250-NEXT: v_dual_cndmask_b32 v2, 0, v6 :: v_dual_cndmask_b32 v3, 0, v7 +; GFX1250-NEXT: v_add_f64_e64 v[0:1], v[0:1], -v[2:3] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX90A-LABEL: two_cond_sub: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_and_b32_e32 v3, 1, v8 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0xbff00000 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_cndmask_b32_e32 v3, 0, v8, vcc +; GFX90A-NEXT: v_fmac_f64_e32 v[0:1], v[2:3], v[4:5] +; GFX90A-NEXT: v_and_b32_e32 v3, 1, v9 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v3, 0, v8, vcc +; GFX90A-NEXT: v_fmac_f64_e32 v[0:1], v[2:3], v[6:7] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-MIR-LABEL: name: two_cond_sub +; GFX90A-MIR: bb.0 (%ir-block.0): +; GFX90A-MIR-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9 +; GFX90A-MIR-NEXT: {{ $}} +; GFX90A-MIR-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr0 +; GFX90A-MIR-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr1 +; GFX90A-MIR-NEXT: undef 
[[COPY1:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr4 +; GFX90A-MIR-NEXT: [[COPY1:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr5 +; GFX90A-MIR-NEXT: undef [[COPY2:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr6 +; GFX90A-MIR-NEXT: [[COPY2:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr7 +; GFX90A-MIR-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8 +; GFX90A-MIR-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr9 +; GFX90A-MIR-NEXT: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY3]], implicit $exec +; GFX90A-MIR-NEXT: undef [[AV_MOV_:%[0-9]+]].sub0:vreg_64_align2 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec +; GFX90A-MIR-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1074790400, implicit $exec +; GFX90A-MIR-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec +; GFX90A-MIR-NEXT: [[AV_MOV_:%[0-9]+]].sub1:vreg_64_align2 = V_CNDMASK_B32_e64 0, 0, 0, [[V_MOV_B32_e32_]], [[V_CMP_NE_U32_e64_]], implicit $exec +; GFX90A-MIR-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = nofpexcept V_FMAC_F64_e32 [[AV_MOV_]], [[COPY1]], [[COPY]], implicit $mode, implicit $exec +; GFX90A-MIR-NEXT: [[V_AND_B32_e32_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY4]], implicit $exec +; GFX90A-MIR-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_1]], implicit $exec +; GFX90A-MIR-NEXT: [[AV_MOV_:%[0-9]+]].sub1:vreg_64_align2 = V_CNDMASK_B32_e64 0, 0, 0, [[V_MOV_B32_e32_]], [[V_CMP_NE_U32_e64_1]], implicit $exec +; GFX90A-MIR-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = nofpexcept V_FMAC_F64_e32 [[AV_MOV_]], [[COPY2]], [[COPY]], implicit $mode, implicit $exec +; GFX90A-MIR-NEXT: $vgpr0 = COPY [[COPY]].sub0 +; GFX90A-MIR-NEXT: $vgpr1 = COPY [[COPY]].sub1 +; GFX90A-MIR-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1 +; Constants are shared between patterns, expecting FMA for both. 
+ %sel1 = select i1 %cond1, double %c1, double 0.0 + %tmp = fsub double %a, %sel1 + %sel2 = select i1 %cond2, double %c2, double 0.0 + %result = fsub double %tmp, %sel2 + ret double %result +} + +; Two conditional subtractions with different base values +define void @two_cond_sub_different_base(ptr %out1, ptr %out2, double %a, double %b, double %c1, double %c2, i1 %cond1, i1 %cond2) { +; GFX1250-LABEL: two_cond_sub_different_base: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v12, 1, v12 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX1250-NEXT: v_dual_cndmask_b32 v8, 0, v8, vcc_lo :: v_dual_bitop2_b32 v13, 1, v13 bitop3:0x40 +; GFX1250-NEXT: v_cmp_ne_u32_e64 s0, 0, v13 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_cndmask_b32 v9, 0, v9, vcc_lo :: v_dual_cndmask_b32 v10, 0, v10, s0 +; GFX1250-NEXT: v_dual_add_f64 v[4:5], v[4:5], -v[8:9] :: v_dual_cndmask_b32 v11, 0, v11, s0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_add_f64_e64 v[6:7], v[6:7], -v[10:11] +; GFX1250-NEXT: flat_store_b64 v[0:1], v[4:5] +; GFX1250-NEXT: flat_store_b64 v[2:3], v[6:7] +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX90A-LABEL: two_cond_sub_different_base: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_and_b32_e32 v12, 1, v12 +; GFX90A-NEXT: v_mov_b32_e32 v16, 0xbff00000 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_cndmask_b32_e32 v15, 0, v16, vcc +; GFX90A-NEXT: v_fmac_f64_e32 v[4:5], v[14:15], v[8:9] +; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[4:5] +; GFX90A-NEXT: v_and_b32_e32 v0, 1, v13 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX90A-NEXT: v_cndmask_b32_e32 v15, 0, v16, vcc +; GFX90A-NEXT: v_fmac_f64_e32 v[6:7], v[14:15], v[10:11] +; GFX90A-NEXT: flat_store_dwordx2 v[2:3], v[6:7] +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-MIR-LABEL: name: two_cond_sub_different_base +; GFX90A-MIR: bb.0 (%ir-block.0): +; GFX90A-MIR-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13 +; GFX90A-MIR-NEXT: {{ $}} +; GFX90A-MIR-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr0 +; GFX90A-MIR-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr1 +; GFX90A-MIR-NEXT: undef [[COPY1:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr2 +; GFX90A-MIR-NEXT: [[COPY1:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr3 +; GFX90A-MIR-NEXT: undef [[COPY2:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr4 +; GFX90A-MIR-NEXT: [[COPY2:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr5 +; GFX90A-MIR-NEXT: undef [[COPY3:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr6 +; GFX90A-MIR-NEXT: [[COPY3:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr7 +; GFX90A-MIR-NEXT: undef [[COPY4:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr8 +; GFX90A-MIR-NEXT: [[COPY4:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr9 +; GFX90A-MIR-NEXT: undef [[COPY5:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr10 +; GFX90A-MIR-NEXT: [[COPY5:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr11 +; GFX90A-MIR-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr12 +; GFX90A-MIR-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr13 +; GFX90A-MIR-NEXT: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY6]], 
implicit $exec +; GFX90A-MIR-NEXT: undef [[AV_MOV_:%[0-9]+]].sub0:vreg_64_align2 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec +; GFX90A-MIR-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1074790400, implicit $exec +; GFX90A-MIR-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec +; GFX90A-MIR-NEXT: [[AV_MOV_:%[0-9]+]].sub1:vreg_64_align2 = V_CNDMASK_B32_e64 0, 0, 0, [[V_MOV_B32_e32_]], [[V_CMP_NE_U32_e64_]], implicit $exec +; GFX90A-MIR-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = nofpexcept V_FMAC_F64_e32 [[AV_MOV_]], [[COPY4]], [[COPY2]], implicit $mode, implicit $exec +; GFX90A-MIR-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into %ir.out1) +; GFX90A-MIR-NEXT: [[V_AND_B32_e32_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY7]], implicit $exec +; GFX90A-MIR-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_1]], implicit $exec +; GFX90A-MIR-NEXT: [[AV_MOV_:%[0-9]+]].sub1:vreg_64_align2 = V_CNDMASK_B32_e64 0, 0, 0, [[V_MOV_B32_e32_]], [[V_CMP_NE_U32_e64_1]], implicit $exec +; GFX90A-MIR-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = nofpexcept V_FMAC_F64_e32 [[AV_MOV_]], [[COPY5]], [[COPY3]], implicit $mode, implicit $exec +; GFX90A-MIR-NEXT: FLAT_STORE_DWORDX2 [[COPY1]], [[COPY3]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into %ir.out2) +; GFX90A-MIR-NEXT: SI_RETURN +; Constants are shared between patterns, expecting FMA for both. + %sel1 = select i1 %cond1, double %c1, double 0.0 + %result1 = fsub double %a, %sel1 + store double %result1, ptr %out1 + + %sel2 = select i1 %cond2, double %c2, double 0.0 + %result2 = fsub double %b, %sel2 + store double %result2, ptr %out2 + ret void +} + +; ============================================================================ +; Patterns in loops +; ============================================================================ +; Loop with conditional subtraction where c is loop-invariant +define void @cond_sub_loop_invariant(ptr %out, double %a, double %c, i1 %cond, i32 %n) { +; GFX1250-LABEL: cond_sub_loop_invariant: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX1250-NEXT: s_mov_b32 s0, 0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX1250-NEXT: v_dual_cndmask_b32 v4, 0, v4 :: v_dual_cndmask_b32 v5, 0, v5 +; GFX1250-NEXT: .LBB6_1: ; %loop +; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_add_f64 v[2:3], v[2:3], -v[4:5] :: v_dual_add_nc_u32 v7, -1, v7 +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 +; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1250-NEXT: s_cbranch_execnz .LBB6_1 +; GFX1250-NEXT: ; %bb.2: ; %exit +; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-NEXT: flat_store_b64 v[0:1], v[2:3] +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX90A-LABEL: cond_sub_loop_invariant: +; GFX90A: ; %bb.0: ; %entry +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GFX90A-NEXT: s_mov_b64 s[4:5], 
0 +; GFX90A-NEXT: .LBB6_1: ; %loop +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_add_u32_e32 v7, -1, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_add_f64 v[2:3], v[2:3], -v[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB6_1 +; GFX90A-NEXT: ; %bb.2: ; %exit +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-MIR-LABEL: name: cond_sub_loop_invariant +; GFX90A-MIR: bb.0.entry: +; GFX90A-MIR-NEXT: successors: %bb.1(0x80000000) +; GFX90A-MIR-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7 +; GFX90A-MIR-NEXT: {{ $}} +; GFX90A-MIR-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr0 +; GFX90A-MIR-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr1 +; GFX90A-MIR-NEXT: undef [[COPY1:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr2 +; GFX90A-MIR-NEXT: [[COPY1:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr3 +; GFX90A-MIR-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr4 +; GFX90A-MIR-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5 +; GFX90A-MIR-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr6 +; GFX90A-MIR-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr7 +; GFX90A-MIR-NEXT: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY4]], implicit $exec +; GFX90A-MIR-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec +; GFX90A-MIR-NEXT: undef [[V_CNDMASK_B32_e64_:%[0-9]+]].sub0:vreg_64_align2 = V_CNDMASK_B32_e64 0, 0, 0, [[COPY2]], [[V_CMP_NE_U32_e64_]], implicit $exec +; GFX90A-MIR-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]].sub1:vreg_64_align2 = V_CNDMASK_B32_e64 0, 0, 0, [[COPY3]], [[V_CMP_NE_U32_e64_]], implicit $exec +; GFX90A-MIR-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 0 +; GFX90A-MIR-NEXT: {{ $}} +; GFX90A-MIR-NEXT: bb.1.loop: +; GFX90A-MIR-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) +; GFX90A-MIR-NEXT: {{ $}} +; GFX90A-MIR-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 -1, [[COPY5]], implicit $exec +; GFX90A-MIR-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 0, [[COPY5]], implicit $exec +; GFX90A-MIR-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 [[V_CMP_EQ_U32_e64_]], [[S_MOV_B64_]], implicit-def $scc +; GFX90A-MIR-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[COPY1]], 1, [[V_CNDMASK_B32_e64_]], 0, 0, implicit $mode, implicit $exec +; GFX90A-MIR-NEXT: $exec = S_ANDN2_B64_term $exec, [[S_MOV_B64_]], implicit-def $scc +; GFX90A-MIR-NEXT: S_CBRANCH_EXECNZ %bb.1, implicit $exec +; GFX90A-MIR-NEXT: S_BRANCH %bb.2 +; GFX90A-MIR-NEXT: {{ $}} +; GFX90A-MIR-NEXT: bb.2.exit: +; GFX90A-MIR-NEXT: $exec = S_OR_B64 $exec, [[S_MOV_B64_]], implicit-def $scc +; GFX90A-MIR-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into %ir.out) +; GFX90A-MIR-NEXT: SI_RETURN +; 'select' is loop-invariant and hoisted before the loop. If we convert this to +; FMA it would be the single instruction from this pattern inside the loop. +; Having instead cheaper 'f_add' looks more efficient and also reduces live-in +; register pressure for the loop. Note that for legacy selector this happens +; automatically as the pattern is split between basic blocks, but requires +; special effort in Global ISel. 
+entry: + br label %loop + +loop: + %i = phi i32 [ 0, %entry ], [ %i.next, %loop ] + %acc = phi double [ %a, %entry ], [ %result, %loop ] + + %sel = select i1 %cond, double %c, double 0.0 + %result = fsub double %acc, %sel + + %i.next = add i32 %i, 1 + %exit.cond = icmp eq i32 %i.next, %n + br i1 %exit.cond, label %exit, label %loop + +exit: + store double %result, ptr %out + ret void +} + +; Loop with conditional subtraction where c depends on loop index +define void @cond_sub_loop_variant(ptr %in, ptr %out, double %a, i1 %cond, i32 %n) { +; GFX1250-LABEL: cond_sub_loop_variant: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s0, 0 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_bitop2_b32 v6, 1, v6 bitop3:0x40 +; GFX1250-NEXT: .LBB7_1: ; %loop +; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX1250-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_lshl_add_u64 v[10:11], v[8:9], 3, v[0:1] +; GFX1250-NEXT: v_add_nc_u32_e32 v8, 1, v8 +; GFX1250-NEXT: flat_load_b64 v[10:11], v[10:11] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: v_dual_cndmask_b32 v10, 0, v10 :: v_dual_cndmask_b32 v11, 0, v11 +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v7, v8 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: v_add_f64_e64 v[4:5], v[4:5], -v[10:11] +; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1250-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1250-NEXT: ; %bb.2: ; %exit +; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-NEXT: flat_store_b64 v[2:3], v[4:5] +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX90A-LABEL: cond_sub_loop_variant: +; GFX90A: ; %bb.0: ; %entry +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s6, 0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, 0xbff00000 +; GFX90A-NEXT: v_mov_b32_e32 v10, s6 +; GFX90A-NEXT: .LBB7_1: ; %loop +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_ashrrev_i32_e32 v11, 31, v10 +; GFX90A-NEXT: v_lshlrev_b64 v[14:15], 3, v[10:11] +; GFX90A-NEXT: v_add_co_u32_e32 v14, vcc, v0, v14 +; GFX90A-NEXT: v_addc_co_u32_e32 v15, vcc, v1, v15, vcc +; GFX90A-NEXT: flat_load_dwordx2 v[14:15], v[14:15] +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX90A-NEXT: v_add_u32_e32 v10, 1, v10 +; GFX90A-NEXT: v_cndmask_b32_e32 v9, 0, v12, vcc +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v7, v10 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_fmac_f64_e32 v[4:5], v[8:9], v[14:15] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB7_1 +; GFX90A-NEXT: ; %bb.2: ; %exit +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: flat_store_dwordx2 v[2:3], v[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-MIR-LABEL: name: cond_sub_loop_variant +; GFX90A-MIR: bb.0.entry: +; GFX90A-MIR-NEXT: successors: %bb.1(0x80000000) +; GFX90A-MIR-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7 +; 
GFX90A-MIR-NEXT: {{ $}} +; GFX90A-MIR-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 +; GFX90A-MIR-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 +; GFX90A-MIR-NEXT: undef [[COPY2:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr2 +; GFX90A-MIR-NEXT: [[COPY2:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr3 +; GFX90A-MIR-NEXT: undef [[COPY3:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr4 +; GFX90A-MIR-NEXT: [[COPY3:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr5 +; GFX90A-MIR-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr6 +; GFX90A-MIR-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr7 +; GFX90A-MIR-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 +; GFX90A-MIR-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 0 +; GFX90A-MIR-NEXT: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY4]], implicit $exec +; GFX90A-MIR-NEXT: undef [[AV_MOV_:%[0-9]+]].sub0:vreg_64_align2 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec +; GFX90A-MIR-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1074790400, implicit $exec +; GFX90A-MIR-NEXT: undef [[COPY6:%[0-9]+]].sub0:vreg_64_align2 = COPY [[S_MOV_B32_]] +; GFX90A-MIR-NEXT: {{ $}} +; GFX90A-MIR-NEXT: bb.1.loop: +; GFX90A-MIR-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) +; GFX90A-MIR-NEXT: {{ $}} +; GFX90A-MIR-NEXT: [[COPY6:%[0-9]+]].sub1:vreg_64_align2 = V_ASHRREV_I32_e32 31, [[COPY6]].sub0, implicit $exec +; GFX90A-MIR-NEXT: [[V_LSHLREV_B64_e64_:%[0-9]+]]:vreg_64_align2 = V_LSHLREV_B64_e64 3, [[COPY6]], implicit $exec +; GFX90A-MIR-NEXT: undef [[V_ADD_CO_U32_e64_:%[0-9]+]].sub0:vreg_64_align2, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_LSHLREV_B64_e64_]].sub0, 0, implicit $exec +; GFX90A-MIR-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]].sub1:vreg_64_align2, dead [[V_ADDC_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY1]], [[V_LSHLREV_B64_e64_]].sub1, [[V_ADD_CO_U32_e64_1]], 0, implicit $exec +; GFX90A-MIR-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64_align2 = FLAT_LOAD_DWORDX2 [[V_ADD_CO_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s64) from %ir.ptr) +; GFX90A-MIR-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec +; GFX90A-MIR-NEXT: [[AV_MOV_:%[0-9]+]].sub1:vreg_64_align2 = V_CNDMASK_B32_e64 0, 0, 0, [[V_MOV_B32_e32_]], [[V_CMP_NE_U32_e64_]], implicit $exec +; GFX90A-MIR-NEXT: undef [[COPY6:%[0-9]+]].sub0:vreg_64_align2 = V_ADD_U32_e32 1, [[COPY6]].sub0, implicit $exec +; GFX90A-MIR-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[COPY5]], [[COPY6]].sub0, implicit $exec +; GFX90A-MIR-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 [[V_CMP_EQ_U32_e64_]], [[S_MOV_B64_]], implicit-def $scc +; GFX90A-MIR-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = nofpexcept V_FMAC_F64_e32 [[AV_MOV_]], [[FLAT_LOAD_DWORDX2_]], [[COPY3]], implicit $mode, implicit $exec +; GFX90A-MIR-NEXT: $exec = S_ANDN2_B64_term $exec, [[S_MOV_B64_]], implicit-def $scc +; GFX90A-MIR-NEXT: S_CBRANCH_EXECNZ %bb.1, implicit $exec +; GFX90A-MIR-NEXT: S_BRANCH %bb.2 +; GFX90A-MIR-NEXT: {{ $}} +; GFX90A-MIR-NEXT: bb.2.exit: +; GFX90A-MIR-NEXT: $exec = S_OR_B64 $exec, [[S_MOV_B64_]], implicit-def $scc +; GFX90A-MIR-NEXT: FLAT_STORE_DWORDX2 [[COPY2]], [[COPY3]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into %ir.out) +; GFX90A-MIR-NEXT: SI_RETURN +; Interesting case where using FMA saves two 'v_cndmasks' in a loop compared to the +; original pattern because the condition is loop independent and 'v_cndmask' only +; selects the correction constant and doesn't 
depend on 'c'. Actually this can +; be beneficial for gfx12 too. +; TODO: This doesn't work in GlobalISel. It doesn't hoist v_cmp and v_cndmask +; out of the loop. +entry: + br label %loop + +loop: + %i = phi i32 [ 0, %entry ], [ %i.next, %loop ] + %acc = phi double [ %a, %entry ], [ %result, %loop ] + + %ptr = getelementptr double, ptr %in, i32 %i + %c = load double, ptr %ptr + %sel = select i1 %cond, double %c, double 0.0 + %result = fsub double %acc, %sel + + %i.next = add i32 %i, 1 + %exit.cond = icmp eq i32 %i.next, %n + br i1 %exit.cond, label %exit, label %loop + +exit: + store double %result, ptr %out + ret void +} + +; Loop where condition depends on loop index +define void @cond_sub_loop_cond_variant(ptr %conds, ptr %out, double %a, double %c, i32 %n) { +; GFX1250-LABEL: cond_sub_loop_cond_variant: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s0, 0 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_mov_b32_e32 v9, s0 +; GFX1250-NEXT: .LBB8_1: ; %loop +; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_ashrrev_i32_e32 v11, 31, v9 +; GFX1250-NEXT: v_add_co_u32 v10, vcc_lo, v0, v9 +; GFX1250-NEXT: v_add_nc_u32_e32 v9, 1, v9 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v11, null, v1, v11, vcc_lo +; GFX1250-NEXT: flat_load_u8 v10, v[10:11] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v10, 1, v10 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX1250-NEXT: v_dual_cndmask_b32 v10, 0, v6 :: v_dual_cndmask_b32 v11, 0, v7 +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v9 +; GFX1250-NEXT: v_add_f64_e64 v[4:5], v[4:5], -v[10:11] +; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1250-NEXT: s_cbranch_execnz .LBB8_1 +; GFX1250-NEXT: ; %bb.2: ; %exit +; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-NEXT: flat_store_b64 v[2:3], v[4:5] +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX90A-LABEL: cond_sub_loop_cond_variant: +; GFX90A: ; %bb.0: ; %entry +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s6, 0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v9, 0xbff00000 +; GFX90A-NEXT: v_mov_b32_e32 v12, s6 +; GFX90A-NEXT: .LBB8_1: ; %loop +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_ashrrev_i32_e32 v11, 31, v12 +; GFX90A-NEXT: v_add_co_u32_e32 v14, vcc, v0, v12 +; GFX90A-NEXT: v_addc_co_u32_e32 v15, vcc, v1, v11, vcc +; GFX90A-NEXT: flat_load_ubyte v11, v[14:15] +; GFX90A-NEXT: v_add_u32_e32 v12, 1, v12 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v8, v12 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_and_b32_e32 v11, 1, v11 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GFX90A-NEXT: v_cndmask_b32_e32 v11, 0, v9, vcc +; GFX90A-NEXT: v_fmac_f64_e32 v[4:5], v[10:11], v[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 +; GFX90A-NEXT: ; %bb.2: ; %exit +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: flat_store_dwordx2 v[2:3], v[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; 
GFX90A-MIR-LABEL: name: cond_sub_loop_cond_variant +; GFX90A-MIR: bb.0.entry: +; GFX90A-MIR-NEXT: successors: %bb.1(0x80000000) +; GFX90A-MIR-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 +; GFX90A-MIR-NEXT: {{ $}} +; GFX90A-MIR-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 +; GFX90A-MIR-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 +; GFX90A-MIR-NEXT: undef [[COPY2:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr2 +; GFX90A-MIR-NEXT: [[COPY2:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr3 +; GFX90A-MIR-NEXT: undef [[COPY3:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr4 +; GFX90A-MIR-NEXT: [[COPY3:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr5 +; GFX90A-MIR-NEXT: undef [[COPY4:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr6 +; GFX90A-MIR-NEXT: [[COPY4:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr7 +; GFX90A-MIR-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr8 +; GFX90A-MIR-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 +; GFX90A-MIR-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 0 +; GFX90A-MIR-NEXT: undef [[AV_MOV_:%[0-9]+]].sub0:vreg_64_align2 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec +; GFX90A-MIR-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1074790400, implicit $exec +; GFX90A-MIR-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] +; GFX90A-MIR-NEXT: {{ $}} +; GFX90A-MIR-NEXT: bb.1.loop: +; GFX90A-MIR-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) +; GFX90A-MIR-NEXT: {{ $}} +; GFX90A-MIR-NEXT: [[V_ASHRREV_I32_e32_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I32_e32 31, [[COPY6]], implicit $exec +; GFX90A-MIR-NEXT: undef [[V_ADD_CO_U32_e64_:%[0-9]+]].sub0:vreg_64_align2, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[COPY6]], 0, implicit $exec +; GFX90A-MIR-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]].sub1:vreg_64_align2, dead [[V_ADDC_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY1]], [[V_ASHRREV_I32_e32_]], [[V_ADD_CO_U32_e64_1]], 0, implicit $exec +; GFX90A-MIR-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[V_ADD_CO_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.cond_ptr) +; GFX90A-MIR-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 1, [[COPY6]], implicit $exec +; GFX90A-MIR-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[COPY5]], [[COPY6]], implicit $exec +; GFX90A-MIR-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 [[V_CMP_EQ_U32_e64_]], [[S_MOV_B64_]], implicit-def $scc +; GFX90A-MIR-NEXT: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[FLAT_LOAD_UBYTE]], implicit $exec +; GFX90A-MIR-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec +; GFX90A-MIR-NEXT: [[AV_MOV_:%[0-9]+]].sub1:vreg_64_align2 = V_CNDMASK_B32_e64 0, 0, 0, [[V_MOV_B32_e32_]], [[V_CMP_NE_U32_e64_]], implicit $exec +; GFX90A-MIR-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = nofpexcept V_FMAC_F64_e32 [[AV_MOV_]], [[COPY4]], [[COPY3]], implicit $mode, implicit $exec +; GFX90A-MIR-NEXT: $exec = S_ANDN2_B64_term $exec, [[S_MOV_B64_]], implicit-def $scc +; GFX90A-MIR-NEXT: S_CBRANCH_EXECNZ %bb.1, implicit $exec +; GFX90A-MIR-NEXT: S_BRANCH %bb.2 +; GFX90A-MIR-NEXT: {{ $}} +; GFX90A-MIR-NEXT: bb.2.exit: +; GFX90A-MIR-NEXT: $exec = S_OR_B64 $exec, [[S_MOV_B64_]], implicit-def $scc +; GFX90A-MIR-NEXT: FLAT_STORE_DWORDX2 [[COPY2]], [[COPY3]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into %ir.out) +; GFX90A-MIR-NEXT: SI_RETURN +; Expect conversion to FMA. 
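+; As an informal sketch (hypothetical value names, not autogenerated checks),
+; the combined form per iteration is expected to look roughly like:
+;   %k      = select i1 %cond, double -1.0, double 0.0
+;   %result = call double @llvm.fma.f64(double %k, double %c, double %acc)
+; so only the high dword of %k needs a v_cndmask; its low dword is a constant 0.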
+entry: + br label %loop + +loop: + %i = phi i32 [ 0, %entry ], [ %i.next, %loop ] + %acc = phi double [ %a, %entry ], [ %result, %loop ] + + %cond_ptr = getelementptr i1, ptr %conds, i32 %i + %cond = load i1, ptr %cond_ptr + %sel = select i1 %cond, double %c, double 0.0 + %result = fsub double %acc, %sel + + %i.next = add i32 %i, 1 + %exit.cond = icmp eq i32 %i.next, %n + br i1 %exit.cond, label %exit, label %loop + +exit: + store double %result, ptr %out + ret void +} + +; Loop where both condition and value depend on loop index +define void @cond_sub_loop_both_variant(ptr %conds, ptr %values, ptr %out, double %a, i32 %n) { +; GFX1250-LABEL: cond_sub_loop_both_variant: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s0, 0 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_mov_b32_e32 v10, s0 +; GFX1250-NEXT: .LBB9_1: ; %loop +; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_ashrrev_i32_e32 v11, 31, v10 +; GFX1250-NEXT: v_add_co_u32 v12, vcc_lo, v0, v10 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v13, null, v1, v11, vcc_lo +; GFX1250-NEXT: flat_load_u8 v9, v[12:13] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v9, 1, v9 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: v_lshl_add_u64 v[12:13], v[10:11], 3, v[2:3] +; GFX1250-NEXT: v_add_nc_u32_e32 v10, 1, v10 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v9 +; GFX1250-NEXT: flat_load_b64 v[12:13], v[12:13] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: v_dual_cndmask_b32 v12, 0, v12 :: v_dual_cndmask_b32 v13, 0, v13 +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v10 +; GFX1250-NEXT: v_add_f64_e64 v[6:7], v[6:7], -v[12:13] +; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1250-NEXT: s_cbranch_execnz .LBB9_1 +; GFX1250-NEXT: ; %bb.2: ; %exit +; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-NEXT: flat_store_b64 v[4:5], v[6:7] +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX90A-LABEL: cond_sub_loop_both_variant: +; GFX90A: ; %bb.0: ; %entry +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v9, 0xbff00000 +; GFX90A-NEXT: v_mov_b32_e32 v12, s4 +; GFX90A-NEXT: .LBB9_1: ; %loop +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; GFX90A-NEXT: v_add_co_u32_e32 v14, vcc, v0, v12 +; GFX90A-NEXT: v_addc_co_u32_e32 v15, vcc, v1, v13, vcc +; GFX90A-NEXT: flat_load_ubyte v11, v[14:15] +; GFX90A-NEXT: v_lshlrev_b64 v[14:15], 3, v[12:13] +; GFX90A-NEXT: v_add_co_u32_e32 v14, vcc, v2, v14 +; GFX90A-NEXT: v_addc_co_u32_e32 v15, vcc, v3, v15, vcc +; GFX90A-NEXT: flat_load_dwordx2 v[14:15], v[14:15] +; GFX90A-NEXT: v_add_u32_e32 v12, 1, v12 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v8, v12 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_and_b32_e32 v11, 1, v11 +; GFX90A-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v11 +; GFX90A-NEXT: v_cndmask_b32_e64 v11, 0, v9, s[4:5] +; GFX90A-NEXT: v_fmac_f64_e32 v[6:7], v[10:11], v[14:15] +; GFX90A-NEXT: s_andn2_b64 
exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 +; GFX90A-NEXT: ; %bb.2: ; %exit +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: flat_store_dwordx2 v[4:5], v[6:7] +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-MIR-LABEL: name: cond_sub_loop_both_variant +; GFX90A-MIR: bb.0.entry: +; GFX90A-MIR-NEXT: successors: %bb.1(0x80000000) +; GFX90A-MIR-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 +; GFX90A-MIR-NEXT: {{ $}} +; GFX90A-MIR-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 +; GFX90A-MIR-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 +; GFX90A-MIR-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 +; GFX90A-MIR-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 +; GFX90A-MIR-NEXT: undef [[COPY4:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr4 +; GFX90A-MIR-NEXT: [[COPY4:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr5 +; GFX90A-MIR-NEXT: undef [[COPY5:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr6 +; GFX90A-MIR-NEXT: [[COPY5:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr7 +; GFX90A-MIR-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr8 +; GFX90A-MIR-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 +; GFX90A-MIR-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 0 +; GFX90A-MIR-NEXT: undef [[AV_MOV_:%[0-9]+]].sub0:vreg_64_align2 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec +; GFX90A-MIR-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1074790400, implicit $exec +; GFX90A-MIR-NEXT: undef [[COPY7:%[0-9]+]].sub0:vreg_64_align2 = COPY [[S_MOV_B32_]] +; GFX90A-MIR-NEXT: {{ $}} +; GFX90A-MIR-NEXT: bb.1.loop: +; GFX90A-MIR-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) +; GFX90A-MIR-NEXT: {{ $}} +; GFX90A-MIR-NEXT: [[COPY7:%[0-9]+]].sub1:vreg_64_align2 = V_ASHRREV_I32_e32 31, [[COPY7]].sub0, implicit $exec +; GFX90A-MIR-NEXT: undef [[V_ADD_CO_U32_e64_:%[0-9]+]].sub0:vreg_64_align2, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[COPY7]].sub0, 0, implicit $exec +; GFX90A-MIR-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]].sub1:vreg_64_align2, dead [[V_ADDC_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY1]], [[COPY7]].sub1, [[V_ADD_CO_U32_e64_1]], 0, implicit $exec +; GFX90A-MIR-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[V_ADD_CO_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.cond_ptr) +; GFX90A-MIR-NEXT: [[V_LSHLREV_B64_e64_:%[0-9]+]]:vreg_64_align2 = V_LSHLREV_B64_e64 3, [[COPY7]], implicit $exec +; GFX90A-MIR-NEXT: undef [[V_ADD_CO_U32_e64_2:%[0-9]+]].sub0:vreg_64_align2, [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[V_LSHLREV_B64_e64_]].sub0, 0, implicit $exec +; GFX90A-MIR-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]].sub1:vreg_64_align2, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[V_LSHLREV_B64_e64_]].sub1, [[V_ADD_CO_U32_e64_3]], 0, implicit $exec +; GFX90A-MIR-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64_align2 = FLAT_LOAD_DWORDX2 [[V_ADD_CO_U32_e64_2]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s64) from %ir.value_ptr) +; GFX90A-MIR-NEXT: undef [[COPY7:%[0-9]+]].sub0:vreg_64_align2 = V_ADD_U32_e32 1, [[COPY7]].sub0, implicit $exec +; GFX90A-MIR-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[COPY6]], [[COPY7]].sub0, implicit $exec +; GFX90A-MIR-NEXT: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[FLAT_LOAD_UBYTE]], implicit $exec +; GFX90A-MIR-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 0, 
[[V_AND_B32_e32_]], implicit $exec +; GFX90A-MIR-NEXT: [[AV_MOV_:%[0-9]+]].sub1:vreg_64_align2 = V_CNDMASK_B32_e64 0, 0, 0, [[V_MOV_B32_e32_]], [[V_CMP_NE_U32_e64_]], implicit $exec +; GFX90A-MIR-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 [[V_CMP_EQ_U32_e64_]], [[S_MOV_B64_]], implicit-def $scc +; GFX90A-MIR-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = nofpexcept V_FMAC_F64_e32 [[AV_MOV_]], [[FLAT_LOAD_DWORDX2_]], [[COPY5]], implicit $mode, implicit $exec +; GFX90A-MIR-NEXT: $exec = S_ANDN2_B64_term $exec, [[S_MOV_B64_]], implicit-def $scc +; GFX90A-MIR-NEXT: S_CBRANCH_EXECNZ %bb.1, implicit $exec +; GFX90A-MIR-NEXT: S_BRANCH %bb.2 +; GFX90A-MIR-NEXT: {{ $}} +; GFX90A-MIR-NEXT: bb.2.exit: +; GFX90A-MIR-NEXT: $exec = S_OR_B64 $exec, [[S_MOV_B64_]], implicit-def $scc +; GFX90A-MIR-NEXT: FLAT_STORE_DWORDX2 [[COPY4]], [[COPY5]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into %ir.out) +; GFX90A-MIR-NEXT: SI_RETURN +; Expect conversion to FMA. +entry: + br label %loop + +loop: + %i = phi i32 [ 0, %entry ], [ %i.next, %loop ] + %acc = phi double [ %a, %entry ], [ %result, %loop ] + + %cond_ptr = getelementptr i1, ptr %conds, i32 %i + %cond = load i1, ptr %cond_ptr + %value_ptr = getelementptr double, ptr %values, i32 %i + %c = load double, ptr %value_ptr + + %sel = select i1 %cond, double %c, double 0.0 + %result = fsub double %acc, %sel + + %i.next = add i32 %i, 1 + %exit.cond = icmp eq i32 %i.next, %n + br i1 %exit.cond, label %exit, label %loop + +exit: + store double %result, ptr %out + ret void +} + +; ============================================================================ +; Multiple patterns in a loop +; ============================================================================ +; Two conditional subtractions in same loop iteration +define void @cond_sub_loop_two_ops(ptr %out, double %a, double %c1, double %c2, i1 %cond1, i1 %cond2, i32 %n) { +; GFX1250-LABEL: cond_sub_loop_two_ops: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v8, 1, v8 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX1250-NEXT: v_dual_cndmask_b32 v4, 0, v4, vcc_lo :: v_dual_bitop2_b32 v9, 1, v9 bitop3:0x40 +; GFX1250-NEXT: v_cmp_ne_u32_e64 s0, 0, v9 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_dual_cndmask_b32 v5, 0, v5, vcc_lo :: v_dual_cndmask_b32 v6, 0, v6, s0 +; GFX1250-NEXT: v_cndmask_b32_e64 v7, 0, v7, s0 +; GFX1250-NEXT: s_mov_b32 s0, 0 +; GFX1250-NEXT: .LBB10_1: ; %loop +; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_add_f64 v[2:3], v[2:3], -v[4:5] :: v_dual_add_nc_u32 v10, -1, v10 +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v10 +; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-NEXT: v_add_f64_e64 v[2:3], v[2:3], -v[6:7] +; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1250-NEXT: s_cbranch_execnz .LBB10_1 +; GFX1250-NEXT: ; %bb.2: ; %exit +; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-NEXT: flat_store_b64 v[0:1], v[2:3] +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX90A-LABEL: cond_sub_loop_two_ops: +; GFX90A: ; %bb.0: ; %entry +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_and_b32_e32 v8, 1, v8 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 
0, v8 +; GFX90A-NEXT: v_and_b32_e32 v8, 1, v9 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX90A-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v7, 0, v7, vcc +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB10_1: ; %loop +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_add_u32_e32 v10, -1, v10 +; GFX90A-NEXT: v_add_f64 v[2:3], v[2:3], -v[4:5] +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v10 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_add_f64 v[2:3], v[2:3], -v[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 +; GFX90A-NEXT: ; %bb.2: ; %exit +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-MIR-LABEL: name: cond_sub_loop_two_ops +; GFX90A-MIR: bb.0.entry: +; GFX90A-MIR-NEXT: successors: %bb.1(0x80000000) +; GFX90A-MIR-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10 +; GFX90A-MIR-NEXT: {{ $}} +; GFX90A-MIR-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr0 +; GFX90A-MIR-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr1 +; GFX90A-MIR-NEXT: undef [[COPY1:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr2 +; GFX90A-MIR-NEXT: [[COPY1:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr3 +; GFX90A-MIR-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr4 +; GFX90A-MIR-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5 +; GFX90A-MIR-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr6 +; GFX90A-MIR-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr7 +; GFX90A-MIR-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr8 +; GFX90A-MIR-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr9 +; GFX90A-MIR-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr10 +; GFX90A-MIR-NEXT: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY6]], implicit $exec +; GFX90A-MIR-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec +; GFX90A-MIR-NEXT: undef [[V_CNDMASK_B32_e64_:%[0-9]+]].sub0:vreg_64_align2 = V_CNDMASK_B32_e64 0, 0, 0, [[COPY2]], [[V_CMP_NE_U32_e64_]], implicit $exec +; GFX90A-MIR-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]].sub1:vreg_64_align2 = V_CNDMASK_B32_e64 0, 0, 0, [[COPY3]], [[V_CMP_NE_U32_e64_]], implicit $exec +; GFX90A-MIR-NEXT: [[V_AND_B32_e32_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY7]], implicit $exec +; GFX90A-MIR-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_1]], implicit $exec +; GFX90A-MIR-NEXT: undef [[V_CNDMASK_B32_e64_1:%[0-9]+]].sub0:vreg_64_align2 = V_CNDMASK_B32_e64 0, 0, 0, [[COPY4]], [[V_CMP_NE_U32_e64_1]], implicit $exec +; GFX90A-MIR-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]].sub1:vreg_64_align2 = V_CNDMASK_B32_e64 0, 0, 0, [[COPY5]], [[V_CMP_NE_U32_e64_1]], implicit $exec +; GFX90A-MIR-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 0 +; GFX90A-MIR-NEXT: {{ $}} +; GFX90A-MIR-NEXT: bb.1.loop: +; GFX90A-MIR-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) +; GFX90A-MIR-NEXT: {{ $}} +; GFX90A-MIR-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[COPY1]], 1, [[V_CNDMASK_B32_e64_]], 0, 0, implicit $mode, implicit $exec +; GFX90A-MIR-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 -1, [[COPY8]], implicit $exec +; GFX90A-MIR-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 
0, [[COPY8]], implicit $exec +; GFX90A-MIR-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 [[V_CMP_EQ_U32_e64_]], [[S_MOV_B64_]], implicit-def $scc +; GFX90A-MIR-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 1, [[V_CNDMASK_B32_e64_1]], 0, 0, implicit $mode, implicit $exec +; GFX90A-MIR-NEXT: $exec = S_ANDN2_B64_term $exec, [[S_MOV_B64_]], implicit-def $scc +; GFX90A-MIR-NEXT: S_CBRANCH_EXECNZ %bb.1, implicit $exec +; GFX90A-MIR-NEXT: S_BRANCH %bb.2 +; GFX90A-MIR-NEXT: {{ $}} +; GFX90A-MIR-NEXT: bb.2.exit: +; GFX90A-MIR-NEXT: $exec = S_OR_B64 $exec, [[S_MOV_B64_]], implicit-def $scc +; GFX90A-MIR-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into %ir.out) +; GFX90A-MIR-NEXT: SI_RETURN +; Applies the same comment as for cond_sub_loop_invariant. Expect no FMA conversion. +entry: + br label %loop + +loop: + %i = phi i32 [ 0, %entry ], [ %i.next, %loop ] + %acc = phi double [ %a, %entry ], [ %result2, %loop ] + + %sel1 = select i1 %cond1, double %c1, double 0.0 + %result1 = fsub double %acc, %sel1 + + %sel2 = select i1 %cond2, double %c2, double 0.0 + %result2 = fsub double %result1, %sel2 + + %i.next = add i32 %i, 1 + %exit.cond = icmp eq i32 %i.next, %n + br i1 %exit.cond, label %exit, label %loop + +exit: + store double %result2, ptr %out + ret void +} + +; ============================================================================ +; Negative tests - patterns that should NOT be optimized +; ============================================================================ + +; f32 should not be optimized (no FMAC f64 for f32) +define float @cond_sub_f32(float %a, float %c, i1 %cond) { +; GFX1250-LABEL: cond_sub_f32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX1250-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo +; GFX1250-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX90A-LABEL: cond_sub_f32: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX90A-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-MIR-LABEL: name: cond_sub_f32 +; GFX90A-MIR: bb.0 (%ir-block.0): +; GFX90A-MIR-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 +; GFX90A-MIR-NEXT: {{ $}} +; GFX90A-MIR-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 +; GFX90A-MIR-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 +; GFX90A-MIR-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 +; GFX90A-MIR-NEXT: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY2]], implicit $exec +; GFX90A-MIR-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec +; GFX90A-MIR-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, [[COPY1]], [[V_CMP_NE_U32_e64_]], implicit $exec +; GFX90A-MIR-NEXT: [[V_SUB_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_SUB_F32_e32 [[COPY]], [[V_CNDMASK_B32_e64_]], implicit $mode, implicit $exec +; GFX90A-MIR-NEXT: $vgpr0 = COPY [[V_SUB_F32_e32_]] +; GFX90A-MIR-NEXT: SI_RETURN implicit $vgpr0 + %sel = select i1 %cond, float %c, float 0.0 + %result = fsub float %a, %sel + ret float %result +} + +; False value is not 0.0 +define 
double @cond_sub_wrong_false_value(double %a, double %c, i1 %cond) {
+; GFX1250-LABEL: cond_sub_wrong_false_value:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_and_b32_e32 v4, 1, v4
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
+; GFX1250-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc_lo
+; GFX1250-NEXT: v_cndmask_b32_e32 v3, 0x3ff00000, v3, vcc_lo
+; GFX1250-NEXT: v_add_f64_e64 v[0:1], v[0:1], -v[2:3]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX90A-LABEL: cond_sub_wrong_false_value:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_and_b32_e32 v4, 1, v4
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0x3ff00000
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX90A-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3]
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+; GFX90A-MIR-LABEL: name: cond_sub_wrong_false_value
+; GFX90A-MIR: bb.0 (%ir-block.0):
+; GFX90A-MIR-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+; GFX90A-MIR-NEXT: {{ $}}
+; GFX90A-MIR-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr0
+; GFX90A-MIR-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr1
+; GFX90A-MIR-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+; GFX90A-MIR-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+; GFX90A-MIR-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+; GFX90A-MIR-NEXT: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY3]], implicit $exec
+; GFX90A-MIR-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1072693248, implicit $exec
+; GFX90A-MIR-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec
+; GFX90A-MIR-NEXT: undef [[V_CNDMASK_B32_e64_:%[0-9]+]].sub0:vreg_64_align2 = V_CNDMASK_B32_e64 0, 0, 0, [[COPY1]], [[V_CMP_NE_U32_e64_]], implicit $exec
+; GFX90A-MIR-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]].sub1:vreg_64_align2 = V_CNDMASK_B32_e64 0, [[V_MOV_B32_e32_]], 0, [[COPY2]], [[V_CMP_NE_U32_e64_]], implicit $exec
+; GFX90A-MIR-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[COPY]], 1, [[V_CNDMASK_B32_e64_]], 0, 0, implicit $mode, implicit $exec
+; GFX90A-MIR-NEXT: $vgpr0 = COPY [[V_ADD_F64_e64_]].sub0
+; GFX90A-MIR-NEXT: $vgpr1 = COPY [[V_ADD_F64_e64_]].sub1
+; GFX90A-MIR-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
+ %sel = select i1 %cond, double %c, double 1.0
+ %result = fsub double %a, %sel
+ ret double %result
+}
diff --git a/llvm/test/CodeGen/AMDGPU/fma-cond-sub.ll b/llvm/test/CodeGen/AMDGPU/fma-cond-sub.ll
new file mode 100644
index 0000000000000..721176d69dcff
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fma-cond-sub.ll
@@ -0,0 +1,1083 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -O2 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX1250 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -O2 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX90A %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -O2 -stop-before=amdgpu-pre-ra-optimizations -verify-machineinstrs < %s | FileCheck -check-prefix=GFX90A-MIR %s
+
+
+; Test the conditional subtraction to FMA optimization on gfx9 targets that have FMAC f64.
+; The optimization doesn't make sense on gfx12 due to dual-issued v_cndmask.
+;
+; Patterns optimized:
+; 1. Direct fsub: result = a - (cond ? c : 0.0)
+; 2. Canonicalized: result = a + (cond ? -c : 0.0)
+; 3. Negated select: result = a + (-(cond ? c : 0.0))
+;
+; These are converted to:
+; result = fma((cond ? -1.0 : 0.0), c, a)
+;
+; This saves one v_cndmask per pattern, which provides the most benefit in loops.
+;
+; As the optimization may be reverted by the amdgpu-pre-ra-optimizations pass,
+; the GFX90A-MIR checks show the MIR at the entry to that pass.
+
+; ============================================================================
+; Basic patterns - single basic block
+; ============================================================================
+
+; Pattern 1: Direct fsub with select
+; result = a - (cond ? c : 0.0)
+define double @cond_sub_basic(double %a, double %c, i1 %cond) {
+; GFX1250-LABEL: cond_sub_basic:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_and_b32_e32 v4, 1, v4
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4
+; GFX1250-NEXT: v_dual_cndmask_b32 v3, 0, v3 :: v_dual_cndmask_b32 v2, 0, v2
+; GFX1250-NEXT: v_add_f64_e64 v[0:1], v[0:1], -v[2:3]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX90A-LABEL: cond_sub_basic:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_and_b32_e32 v4, 1, v4
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX90A-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; GFX90A-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3]
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+; GFX90A-MIR-LABEL: name: cond_sub_basic
+; GFX90A-MIR: bb.0 (%ir-block.0):
+; GFX90A-MIR-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+; GFX90A-MIR-NEXT: {{ $}}
+; GFX90A-MIR-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+; GFX90A-MIR-NEXT: undef [[COPY1:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr3
+; GFX90A-MIR-NEXT: [[COPY1:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr2
+; GFX90A-MIR-NEXT: undef [[COPY2:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr1
+; GFX90A-MIR-NEXT: [[COPY2:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr0
+; GFX90A-MIR-NEXT: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY]], implicit $exec
+; GFX90A-MIR-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1074790400, implicit $exec
+; GFX90A-MIR-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 1, [[V_AND_B32_e32_]], implicit $exec
+; GFX90A-MIR-NEXT: undef [[V_CNDMASK_B32_e64_:%[0-9]+]].sub1:vreg_64_align2 = V_CNDMASK_B32_e64 0, 0, 0, [[V_MOV_B32_e32_]], [[V_CMP_EQ_U32_e64_]], implicit $exec
+; GFX90A-MIR-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]].sub0:vreg_64_align2 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
+; GFX90A-MIR-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = nofpexcept V_FMAC_F64_e32 [[V_CNDMASK_B32_e64_]], [[COPY1]], [[COPY2]], implicit $mode, implicit $exec
+; GFX90A-MIR-NEXT: $vgpr0 = COPY [[COPY2]].sub0
+; GFX90A-MIR-NEXT: $vgpr1 = COPY [[COPY2]].sub1
+; GFX90A-MIR-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
+; Expected conversion to FMA and revert
+ %sel = select i1 %cond, double %c, double 0.0
+ %result = fsub double %a, %sel
+ ret double %result
+}
+
+; Pattern 2: fadd with negated select (canonicalized form)
+; result = a + (cond ?
-c : 0.0) +define double @cond_sub_fadd_neg_select(double %a, double %c, i1 %cond) { +; GFX1250-LABEL: cond_sub_fadd_neg_select: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 +; GFX1250-NEXT: v_cndmask_b32_e32 v3, 0x80000000, v3, vcc_lo +; GFX1250-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc_lo +; GFX1250-NEXT: v_add_f64_e64 v[0:1], v[0:1], -v[2:3] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX90A-LABEL: cond_sub_fadd_neg_select: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GFX90A-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-MIR-LABEL: name: cond_sub_fadd_neg_select +; GFX90A-MIR: bb.0 (%ir-block.0): +; GFX90A-MIR-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 +; GFX90A-MIR-NEXT: {{ $}} +; GFX90A-MIR-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr4 +; GFX90A-MIR-NEXT: undef [[COPY1:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr3 +; GFX90A-MIR-NEXT: [[COPY1:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr2 +; GFX90A-MIR-NEXT: undef [[COPY2:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr1 +; GFX90A-MIR-NEXT: [[COPY2:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr0 +; GFX90A-MIR-NEXT: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY]], implicit $exec +; GFX90A-MIR-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1074790400, implicit $exec +; GFX90A-MIR-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 1, [[V_AND_B32_e32_]], implicit $exec +; GFX90A-MIR-NEXT: undef [[V_CNDMASK_B32_e64_:%[0-9]+]].sub1:vreg_64_align2 = V_CNDMASK_B32_e64 0, 0, 0, [[V_MOV_B32_e32_]], [[V_CMP_EQ_U32_e64_]], implicit $exec +; GFX90A-MIR-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]].sub0:vreg_64_align2 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec +; GFX90A-MIR-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = nofpexcept V_FMAC_F64_e32 [[V_CNDMASK_B32_e64_]], [[COPY1]], [[COPY2]], implicit $mode, implicit $exec +; GFX90A-MIR-NEXT: $vgpr0 = COPY [[COPY2]].sub0 +; GFX90A-MIR-NEXT: $vgpr1 = COPY [[COPY2]].sub1 +; GFX90A-MIR-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1 +; Expected conversion to FMA and revert + %neg_c = fneg double %c + %sel = select i1 %cond, double %neg_c, double 0.0 + %result = fadd double %a, %sel + ret double %result +} + +; Pattern 3: fadd with fneg of select +; result = a + (-(cond ? 
c : 0.0)) +define double @cond_sub_fadd_fneg(double %a, double %c, i1 %cond) { +; GFX1250-LABEL: cond_sub_fadd_fneg: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 +; GFX1250-NEXT: v_dual_cndmask_b32 v3, 0, v3 :: v_dual_cndmask_b32 v2, 0, v2 +; GFX1250-NEXT: v_add_f64_e64 v[0:1], v[0:1], -v[2:3] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX90A-LABEL: cond_sub_fadd_fneg: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GFX90A-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-MIR-LABEL: name: cond_sub_fadd_fneg +; GFX90A-MIR: bb.0 (%ir-block.0): +; GFX90A-MIR-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 +; GFX90A-MIR-NEXT: {{ $}} +; GFX90A-MIR-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr4 +; GFX90A-MIR-NEXT: undef [[COPY1:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr3 +; GFX90A-MIR-NEXT: [[COPY1:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr2 +; GFX90A-MIR-NEXT: undef [[COPY2:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr1 +; GFX90A-MIR-NEXT: [[COPY2:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr0 +; GFX90A-MIR-NEXT: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY]], implicit $exec +; GFX90A-MIR-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1074790400, implicit $exec +; GFX90A-MIR-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 1, [[V_AND_B32_e32_]], implicit $exec +; GFX90A-MIR-NEXT: undef [[V_CNDMASK_B32_e64_:%[0-9]+]].sub1:vreg_64_align2 = V_CNDMASK_B32_e64 0, 0, 0, [[V_MOV_B32_e32_]], [[V_CMP_EQ_U32_e64_]], implicit $exec +; GFX90A-MIR-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]].sub0:vreg_64_align2 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec +; GFX90A-MIR-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = nofpexcept V_FMAC_F64_e32 [[V_CNDMASK_B32_e64_]], [[COPY1]], [[COPY2]], implicit $mode, implicit $exec +; GFX90A-MIR-NEXT: $vgpr0 = COPY [[COPY2]].sub0 +; GFX90A-MIR-NEXT: $vgpr1 = COPY [[COPY2]].sub1 +; GFX90A-MIR-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1 +; Expected conversion to FMA and revert + %sel = select i1 %cond, double %c, double 0.0 + %neg_sel = fneg double %sel + %result = fadd double %a, %neg_sel + ret double %result +} + +; Test with constant value +define double @cond_sub_constant(double %a, i1 %cond) { +; GFX1250-LABEL: cond_sub_constant: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_bitop2_b32 v3, 1, v2 bitop3:0x40 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 0x40140000, vcc_lo +; GFX1250-NEXT: v_add_f64_e64 v[0:1], v[0:1], -v[2:3] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX90A-LABEL: cond_sub_constant: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40140000 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] 
+; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-MIR-LABEL: name: cond_sub_constant +; GFX90A-MIR: bb.0 (%ir-block.0): +; GFX90A-MIR-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 +; GFX90A-MIR-NEXT: {{ $}} +; GFX90A-MIR-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 +; GFX90A-MIR-NEXT: undef [[COPY1:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr1 +; GFX90A-MIR-NEXT: [[COPY1:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr0 +; GFX90A-MIR-NEXT: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY]], implicit $exec +; GFX90A-MIR-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1075052544, implicit $exec +; GFX90A-MIR-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 1, [[V_AND_B32_e32_]], implicit $exec +; GFX90A-MIR-NEXT: undef [[V_CNDMASK_B32_e64_:%[0-9]+]].sub1:vreg_64_align2 = V_CNDMASK_B32_e64 0, 0, 0, [[V_MOV_B32_e32_]], [[V_CMP_EQ_U32_e64_]], implicit $exec +; GFX90A-MIR-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]].sub0:vreg_64_align2 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec +; GFX90A-MIR-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[COPY1]], 1, [[V_CNDMASK_B32_e64_]], 0, 0, implicit $mode, implicit $exec +; GFX90A-MIR-NEXT: $vgpr0 = COPY [[V_ADD_F64_e64_]].sub0 +; GFX90A-MIR-NEXT: $vgpr1 = COPY [[V_ADD_F64_e64_]].sub1 +; GFX90A-MIR-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1 +; Constants aren't expected. + %sel = select i1 %cond, double 5.0, double 0.0 + %result = fsub double %a, %sel + ret double %result +} + +; ============================================================================ +; Multiple patterns in single basic block +; ============================================================================ + +; Two independent conditional subtractions +define double @two_cond_sub(double %a, double %b, double %c1, double %c2, i1 %cond1, i1 %cond2) { +; GFX1250-LABEL: two_cond_sub: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v2, 1, v8 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX1250-NEXT: v_dual_cndmask_b32 v3, 0, v5 :: v_dual_cndmask_b32 v2, 0, v4 +; GFX1250-NEXT: v_dual_add_f64 v[0:1], v[0:1], -v[2:3] :: v_dual_bitop2_b32 v2, 1, v9 bitop3:0x40 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX1250-NEXT: v_dual_cndmask_b32 v3, 0, v7 :: v_dual_cndmask_b32 v2, 0, v6 +; GFX1250-NEXT: v_add_f64_e64 v[0:1], v[0:1], -v[2:3] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX90A-LABEL: two_cond_sub: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_and_b32_e32 v2, 1, v8 +; GFX90A-NEXT: v_and_b32_e32 v9, 1, v9 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0xbff00000 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v3, 0, v8, vcc +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9 +; GFX90A-NEXT: v_fmac_f64_e32 v[0:1], v[2:3], v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, 0, v8, vcc +; GFX90A-NEXT: v_fmac_f64_e32 v[0:1], v[2:3], v[6:7] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-MIR-LABEL: name: two_cond_sub +; GFX90A-MIR: bb.0 (%ir-block.0): +; GFX90A-MIR-NEXT: liveins: $vgpr0, $vgpr1, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9 +; GFX90A-MIR-NEXT: {{ $}} +; GFX90A-MIR-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr9 +; GFX90A-MIR-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8 +; 
GFX90A-MIR-NEXT: undef [[COPY2:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr7 +; GFX90A-MIR-NEXT: [[COPY2:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr6 +; GFX90A-MIR-NEXT: undef [[COPY3:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr5 +; GFX90A-MIR-NEXT: [[COPY3:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr4 +; GFX90A-MIR-NEXT: undef [[COPY4:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr1 +; GFX90A-MIR-NEXT: [[COPY4:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr0 +; GFX90A-MIR-NEXT: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY]], implicit $exec +; GFX90A-MIR-NEXT: [[V_AND_B32_e32_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY1]], implicit $exec +; GFX90A-MIR-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1074790400, implicit $exec +; GFX90A-MIR-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 1, [[V_AND_B32_e32_1]], implicit $exec +; GFX90A-MIR-NEXT: undef [[V_CNDMASK_B32_e64_:%[0-9]+]].sub1:vreg_64_align2 = V_CNDMASK_B32_e64 0, 0, 0, [[V_MOV_B32_e32_]], [[V_CMP_EQ_U32_e64_]], implicit $exec +; GFX90A-MIR-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]].sub0:vreg_64_align2 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec +; GFX90A-MIR-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = nofpexcept V_FMAC_F64_e32 [[V_CNDMASK_B32_e64_]], [[COPY3]], [[COPY4]], implicit $mode, implicit $exec +; GFX90A-MIR-NEXT: [[V_CMP_EQ_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 1, [[V_AND_B32_e32_]], implicit $exec +; GFX90A-MIR-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]].sub1:vreg_64_align2 = V_CNDMASK_B32_e64 0, 0, 0, [[V_MOV_B32_e32_]], [[V_CMP_EQ_U32_e64_1]], implicit $exec +; GFX90A-MIR-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = nofpexcept V_FMAC_F64_e32 [[V_CNDMASK_B32_e64_]], [[COPY2]], [[COPY4]], implicit $mode, implicit $exec +; GFX90A-MIR-NEXT: $vgpr0 = COPY [[COPY4]].sub0 +; GFX90A-MIR-NEXT: $vgpr1 = COPY [[COPY4]].sub1 +; GFX90A-MIR-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1 +; Constants are shared between patterns, expecting FMA for both. 
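+; Informal sketch of the expected combined form (hypothetical value names, not
+; autogenerated checks):
+;   %k1  = select i1 %cond1, double -1.0, double 0.0
+;   %tmp = call double @llvm.fma.f64(double %k1, double %c1, double %a)
+;   %k2  = select i1 %cond2, double -1.0, double 0.0
+;   %res = call double @llvm.fma.f64(double %k2, double %c2, double %tmp)
+; Both multipliers reuse the same 0.0 low dword and the same -1.0 high-dword
+; constant, which is why a single 0xbff00000 v_mov appears in the checks above.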
+ %sel1 = select i1 %cond1, double %c1, double 0.0 + %tmp = fsub double %a, %sel1 + %sel2 = select i1 %cond2, double %c2, double 0.0 + %result = fsub double %tmp, %sel2 + ret double %result +} + +; Two conditional subtractions with different base values +define void @two_cond_sub_different_base(ptr %out1, ptr %out2, double %a, double %b, double %c1, double %c2, i1 %cond1, i1 %cond2) { +; GFX1250-LABEL: two_cond_sub_different_base: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v12, 1, v12 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 +; GFX1250-NEXT: v_dual_cndmask_b32 v8, 0, v8, vcc_lo :: v_dual_bitop2_b32 v13, 1, v13 bitop3:0x40 +; GFX1250-NEXT: v_cmp_eq_u32_e64 s0, 1, v13 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_cndmask_b32 v9, 0, v9, vcc_lo :: v_dual_cndmask_b32 v11, 0, v11, s0 +; GFX1250-NEXT: v_dual_add_f64 v[4:5], v[4:5], -v[8:9] :: v_dual_cndmask_b32 v10, 0, v10, s0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_add_f64_e64 v[6:7], v[6:7], -v[10:11] +; GFX1250-NEXT: flat_store_b64 v[0:1], v[4:5] +; GFX1250-NEXT: flat_store_b64 v[2:3], v[6:7] +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX90A-LABEL: two_cond_sub_different_base: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_and_b32_e32 v12, 1, v12 +; GFX90A-NEXT: v_and_b32_e32 v14, 1, v13 +; GFX90A-NEXT: v_mov_b32_e32 v15, 0xbff00000 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12 +; GFX90A-NEXT: v_cndmask_b32_e32 v13, 0, v15, vcc +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14 +; GFX90A-NEXT: v_fmac_f64_e32 v[4:5], v[12:13], v[8:9] +; GFX90A-NEXT: v_cndmask_b32_e32 v13, 0, v15, vcc +; GFX90A-NEXT: v_fmac_f64_e32 v[6:7], v[12:13], v[10:11] +; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[4:5] +; GFX90A-NEXT: flat_store_dwordx2 v[2:3], v[6:7] +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-MIR-LABEL: name: two_cond_sub_different_base +; GFX90A-MIR: bb.0 (%ir-block.0): +; GFX90A-MIR-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13 +; GFX90A-MIR-NEXT: {{ $}} +; GFX90A-MIR-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr13 +; GFX90A-MIR-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr12 +; GFX90A-MIR-NEXT: undef [[COPY2:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr11 +; GFX90A-MIR-NEXT: [[COPY2:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr10 +; GFX90A-MIR-NEXT: undef [[COPY3:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr9 +; GFX90A-MIR-NEXT: [[COPY3:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr8 +; GFX90A-MIR-NEXT: undef [[COPY4:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr7 +; GFX90A-MIR-NEXT: [[COPY4:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr6 +; GFX90A-MIR-NEXT: undef [[COPY5:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr5 +; GFX90A-MIR-NEXT: [[COPY5:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr4 +; GFX90A-MIR-NEXT: undef [[COPY6:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr3 +; GFX90A-MIR-NEXT: [[COPY6:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr2 +; GFX90A-MIR-NEXT: undef [[COPY7:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr1 +; GFX90A-MIR-NEXT: [[COPY7:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr0 +; GFX90A-MIR-NEXT: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY]], 
implicit $exec +; GFX90A-MIR-NEXT: [[V_AND_B32_e32_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY1]], implicit $exec +; GFX90A-MIR-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1074790400, implicit $exec +; GFX90A-MIR-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 1, [[V_AND_B32_e32_1]], implicit $exec +; GFX90A-MIR-NEXT: undef [[V_CNDMASK_B32_e64_:%[0-9]+]].sub1:vreg_64_align2 = V_CNDMASK_B32_e64 0, 0, 0, [[V_MOV_B32_e32_]], [[V_CMP_EQ_U32_e64_]], implicit $exec +; GFX90A-MIR-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]].sub0:vreg_64_align2 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec +; GFX90A-MIR-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = nofpexcept V_FMAC_F64_e32 [[V_CNDMASK_B32_e64_]], [[COPY3]], [[COPY5]], implicit $mode, implicit $exec +; GFX90A-MIR-NEXT: FLAT_STORE_DWORDX2 [[COPY7]], [[COPY5]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into %ir.out1) +; GFX90A-MIR-NEXT: [[V_CMP_EQ_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 1, [[V_AND_B32_e32_]], implicit $exec +; GFX90A-MIR-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]].sub1:vreg_64_align2 = V_CNDMASK_B32_e64 0, 0, 0, [[V_MOV_B32_e32_]], [[V_CMP_EQ_U32_e64_1]], implicit $exec +; GFX90A-MIR-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = nofpexcept V_FMAC_F64_e32 [[V_CNDMASK_B32_e64_]], [[COPY2]], [[COPY4]], implicit $mode, implicit $exec +; GFX90A-MIR-NEXT: FLAT_STORE_DWORDX2 [[COPY6]], [[COPY4]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into %ir.out2) +; GFX90A-MIR-NEXT: SI_RETURN +; Constants are shared between patterns, expecting FMA for both. + %sel1 = select i1 %cond1, double %c1, double 0.0 + %result1 = fsub double %a, %sel1 + store double %result1, ptr %out1 + + %sel2 = select i1 %cond2, double %c2, double 0.0 + %result2 = fsub double %b, %sel2 + store double %result2, ptr %out2 + ret void +} + +; ============================================================================ +; Patterns in loops +; ============================================================================ +; Loop with conditional subtraction where c is loop-invariant +define void @cond_sub_loop_invariant(ptr %out, double %a, double %c, i1 %cond, i32 %n) { +; GFX1250-LABEL: cond_sub_loop_invariant: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX1250-NEXT: s_mov_b32 s0, 0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 +; GFX1250-NEXT: v_dual_cndmask_b32 v5, 0, v5 :: v_dual_cndmask_b32 v4, 0, v4 +; GFX1250-NEXT: .LBB6_1: ; %loop +; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_add_f64 v[2:3], v[2:3], -v[4:5] :: v_dual_add_nc_u32 v7, -1, v7 +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 +; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1250-NEXT: s_cbranch_execnz .LBB6_1 +; GFX1250-NEXT: ; %bb.2: ; %exit +; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-NEXT: flat_store_b64 v[0:1], v[2:3] +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX90A-LABEL: cond_sub_loop_invariant: +; GFX90A: ; %bb.0: ; %entry +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; GFX90A-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GFX90A-NEXT: 
v_cndmask_b32_e32 v4, 0, v4, vcc +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB6_1: ; %loop +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_add_u32_e32 v7, -1, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_add_f64 v[2:3], v[2:3], -v[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB6_1 +; GFX90A-NEXT: ; %bb.2: ; %exit +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-MIR-LABEL: name: cond_sub_loop_invariant +; GFX90A-MIR: bb.0.entry: +; GFX90A-MIR-NEXT: successors: %bb.1(0x80000000) +; GFX90A-MIR-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7 +; GFX90A-MIR-NEXT: {{ $}} +; GFX90A-MIR-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr7 +; GFX90A-MIR-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr6 +; GFX90A-MIR-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr5 +; GFX90A-MIR-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr4 +; GFX90A-MIR-NEXT: undef [[COPY4:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr3 +; GFX90A-MIR-NEXT: [[COPY4:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr2 +; GFX90A-MIR-NEXT: undef [[COPY5:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr1 +; GFX90A-MIR-NEXT: [[COPY5:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr0 +; GFX90A-MIR-NEXT: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY1]], implicit $exec +; GFX90A-MIR-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 1, [[V_AND_B32_e32_]], implicit $exec +; GFX90A-MIR-NEXT: undef [[V_CNDMASK_B32_e64_:%[0-9]+]].sub1:vreg_64_align2 = V_CNDMASK_B32_e64 0, 0, 0, [[COPY2]], [[V_CMP_EQ_U32_e64_]], implicit $exec +; GFX90A-MIR-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]].sub0:vreg_64_align2 = V_CNDMASK_B32_e64 0, 0, 0, [[COPY3]], [[V_CMP_EQ_U32_e64_]], implicit $exec +; GFX90A-MIR-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 +; GFX90A-MIR-NEXT: {{ $}} +; GFX90A-MIR-NEXT: bb.1.loop: +; GFX90A-MIR-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) +; GFX90A-MIR-NEXT: {{ $}} +; GFX90A-MIR-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 -1, [[COPY]], implicit $exec +; GFX90A-MIR-NEXT: [[V_CMP_EQ_U32_e64_1:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 0, [[COPY]], implicit $exec +; GFX90A-MIR-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 [[V_CMP_EQ_U32_e64_1]], [[S_MOV_B64_]], implicit-def $scc +; GFX90A-MIR-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[COPY4]], 1, [[V_CNDMASK_B32_e64_]], 0, 0, implicit $mode, implicit $exec +; GFX90A-MIR-NEXT: $exec = S_ANDN2_B64_term $exec, [[S_MOV_B64_]], implicit-def $scc +; GFX90A-MIR-NEXT: S_CBRANCH_EXECNZ %bb.1, implicit $exec +; GFX90A-MIR-NEXT: S_BRANCH %bb.2 +; GFX90A-MIR-NEXT: {{ $}} +; GFX90A-MIR-NEXT: bb.2.exit: +; GFX90A-MIR-NEXT: $exec = S_OR_B64 $exec, [[S_MOV_B64_]], implicit-def $scc +; GFX90A-MIR-NEXT: FLAT_STORE_DWORDX2 [[COPY5]], [[COPY4]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into %ir.out) +; GFX90A-MIR-NEXT: SI_RETURN +; 'select' is loop-invariant and hoisted before the loop. If we convert this to +; FMA it would be the single instruction from this pattern inside the loop. +; Having instead cheaper 'f_add' looks more efficient and also reduces live-in +; register pressure for the loop. Note that for legacy selector this happens +; automatically as the pattern is split between basic blocks, but requires +; special effort in Global ISel. 
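+; Informal sketch of the two loop-body alternatives being compared (hypothetical
+; names, not autogenerated checks):
+;   select form: %sel = select i1 %cond, double %c, double 0.0    ; hoisted
+;                loop: %acc = fsub double %acc, %sel              ; one v_add_f64
+;   FMA form:    %k = select i1 %cond, double -1.0, double 0.0    ; hoisted
+;                loop: %acc = fma(%k, %c, %acc)                   ; one v_fmac_f64
+; Either way the loop contains a single VALU op, but the FMA form keeps both %k
+; and %c live across the loop while the select form only keeps %sel.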
+entry: + br label %loop + +loop: + %i = phi i32 [ 0, %entry ], [ %i.next, %loop ] + %acc = phi double [ %a, %entry ], [ %result, %loop ] + + %sel = select i1 %cond, double %c, double 0.0 + %result = fsub double %acc, %sel + + %i.next = add i32 %i, 1 + %exit.cond = icmp eq i32 %i.next, %n + br i1 %exit.cond, label %exit, label %loop + +exit: + store double %result, ptr %out + ret void +} + +; Loop with conditional subtraction where c depends on loop index +define void @cond_sub_loop_variant(ptr %in, ptr %out, double %a, i1 %cond, i32 %n) { +; GFX1250-LABEL: cond_sub_loop_variant: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX1250-NEXT: s_mov_b32 s1, 0 +; GFX1250-NEXT: s_mov_b32 s2, 0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 +; GFX1250-NEXT: .LBB7_1: ; %loop +; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-NEXT: s_ashr_i32 s3, s2, 31 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: v_lshl_add_u64 v[8:9], s[2:3], 3, v[0:1] +; GFX1250-NEXT: s_add_co_i32 s2, s2, 1 +; GFX1250-NEXT: v_cmp_eq_u32_e64 s0, s2, v7 +; GFX1250-NEXT: flat_load_b64 v[8:9], v[8:9] +; GFX1250-NEXT: s_or_b32 s1, s0, s1 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: v_dual_cndmask_b32 v9, 0, v9 :: v_dual_cndmask_b32 v8, 0, v8 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_add_f64_e64 v[4:5], v[4:5], -v[8:9] +; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX1250-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1250-NEXT: ; %bb.2: ; %exit +; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1250-NEXT: flat_store_b64 v[2:3], v[4:5] +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX90A-LABEL: cond_sub_loop_variant: +; GFX90A: ; %bb.0: ; %entry +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0xbff00000 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; GFX90A-NEXT: s_mov_b32 s6, 0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_cndmask_b32_e32 v9, 0, v8, vcc +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: .LBB7_1: ; %loop +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_ashr_i32 s7, s6, 31 +; GFX90A-NEXT: s_lshl_b64 s[8:9], s[6:7], 3 +; GFX90A-NEXT: v_mov_b32_e32 v6, s9 +; GFX90A-NEXT: v_add_co_u32_e32 v10, vcc, s8, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v11, vcc, v1, v6, vcc +; GFX90A-NEXT: flat_load_dwordx2 v[10:11], v[10:11] +; GFX90A-NEXT: s_add_i32 s6, s6, 1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s6, v7 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_fmac_f64_e32 v[4:5], v[8:9], v[10:11] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB7_1 +; GFX90A-NEXT: ; %bb.2: ; %exit +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: flat_store_dwordx2 v[2:3], v[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-MIR-LABEL: name: cond_sub_loop_variant +; GFX90A-MIR: bb.0.entry: +; GFX90A-MIR-NEXT: successors: %bb.1(0x80000000) +; GFX90A-MIR-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7 +; GFX90A-MIR-NEXT: {{ $}} +; GFX90A-MIR-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr7 +; GFX90A-MIR-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr6 +; GFX90A-MIR-NEXT: undef 
[[COPY2:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr5 +; GFX90A-MIR-NEXT: [[COPY2:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr4 +; GFX90A-MIR-NEXT: undef [[COPY3:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr3 +; GFX90A-MIR-NEXT: [[COPY3:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr2 +; GFX90A-MIR-NEXT: undef [[COPY4:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr1 +; GFX90A-MIR-NEXT: undef [[COPY5:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr0 +; GFX90A-MIR-NEXT: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY1]], implicit $exec +; GFX90A-MIR-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = S_MOV_B32 0 +; GFX90A-MIR-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 +; GFX90A-MIR-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1074790400, implicit $exec +; GFX90A-MIR-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 1, [[V_AND_B32_e32_]], implicit $exec +; GFX90A-MIR-NEXT: undef [[V_CNDMASK_B32_e64_:%[0-9]+]].sub1:vreg_64_align2 = V_CNDMASK_B32_e64 0, 0, 0, [[V_MOV_B32_e32_]], [[V_CMP_EQ_U32_e64_]], implicit $exec +; GFX90A-MIR-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]].sub0:vreg_64_align2 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec +; GFX90A-MIR-NEXT: {{ $}} +; GFX90A-MIR-NEXT: bb.1.loop: +; GFX90A-MIR-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) +; GFX90A-MIR-NEXT: {{ $}} +; GFX90A-MIR-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_ASHR_I32 [[S_MOV_B32_]].sub0, 31, implicit-def dead $scc +; GFX90A-MIR-NEXT: [[S_LSHL_B64_:%[0-9]+]]:sreg_64 = S_LSHL_B64 [[S_MOV_B32_]], 3, implicit-def dead $scc +; GFX90A-MIR-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_LSHL_B64_]].sub1 +; GFX90A-MIR-NEXT: undef [[V_ADD_CO_U32_e64_:%[0-9]+]].sub0:vreg_64_align2, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[S_LSHL_B64_]].sub0, [[COPY5]].sub0, 0, implicit $exec +; GFX90A-MIR-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]].sub1:vreg_64_align2, dead [[V_ADDC_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]].sub1, [[COPY6]], [[V_ADD_CO_U32_e64_1]], 0, implicit $exec +; GFX90A-MIR-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64_align2 = FLAT_LOAD_DWORDX2 [[V_ADD_CO_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s64) from %ir.ptr) +; GFX90A-MIR-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = S_ADD_I32 [[S_MOV_B32_]].sub0, 1, implicit-def dead $scc +; GFX90A-MIR-NEXT: [[V_CMP_EQ_U32_e64_1:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[S_MOV_B32_]].sub0, [[COPY]], implicit $exec +; GFX90A-MIR-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 [[V_CMP_EQ_U32_e64_1]], [[S_MOV_B64_]], implicit-def $scc +; GFX90A-MIR-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = nofpexcept V_FMAC_F64_e32 [[V_CNDMASK_B32_e64_]], [[FLAT_LOAD_DWORDX2_]], [[COPY2]], implicit $mode, implicit $exec +; GFX90A-MIR-NEXT: $exec = S_ANDN2_B64_term $exec, [[S_MOV_B64_]], implicit-def $scc +; GFX90A-MIR-NEXT: S_CBRANCH_EXECNZ %bb.1, implicit $exec +; GFX90A-MIR-NEXT: S_BRANCH %bb.2 +; GFX90A-MIR-NEXT: {{ $}} +; GFX90A-MIR-NEXT: bb.2.exit: +; GFX90A-MIR-NEXT: $exec = S_OR_B64 $exec, [[S_MOV_B64_]], implicit-def $scc +; GFX90A-MIR-NEXT: FLAT_STORE_DWORDX2 [[COPY3]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into %ir.out) +; GFX90A-MIR-NEXT: SI_RETURN +; Interesting case where using FMA saves two 'v_cndmasks' in a loop compared to the +; original pattern because the condition is loop independent and 'v_cndmask' only +; selects the correction constant and doesn't depend on 'c'. Actually this can +; be beneficial for gfx12 too. 
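+; Illustrative sketch of the saving (register names are made up): without FMA each
+; iteration selects the loaded 'c' against 0.0 and subtracts,
+;       v_cndmask_b32 sel.lo, 0, c.lo
+;       v_cndmask_b32 sel.hi, 0, c.hi
+;       v_add_f64     acc, acc, -sel
+; while with FMA the select of the -1.0/0.0 multiplier is loop-invariant and gets
+; hoisted, leaving only
+;       v_fmac_f64    acc, mul, c
+; per iteration, as the GFX90A checks above show.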
+entry: + br label %loop + +loop: + %i = phi i32 [ 0, %entry ], [ %i.next, %loop ] + %acc = phi double [ %a, %entry ], [ %result, %loop ] + + %ptr = getelementptr double, ptr %in, i32 %i + %c = load double, ptr %ptr + %sel = select i1 %cond, double %c, double 0.0 + %result = fsub double %acc, %sel + + %i.next = add i32 %i, 1 + %exit.cond = icmp eq i32 %i.next, %n + br i1 %exit.cond, label %exit, label %loop + +exit: + store double %result, ptr %out + ret void +} + +; Loop where condition depends on loop index +define void @cond_sub_loop_cond_variant(ptr %conds, ptr %out, double %a, double %c, i32 %n) { +; GFX1250-LABEL: cond_sub_loop_cond_variant: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s2, 0 +; GFX1250-NEXT: s_mov_b32 s0, 0 +; GFX1250-NEXT: .LBB8_1: ; %loop +; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: s_ashr_i32 s1, s0, 31 +; GFX1250-NEXT: v_add_nc_u64_e32 v[10:11], s[0:1], v[0:1] +; GFX1250-NEXT: s_add_co_i32 s0, s0, 1 +; GFX1250-NEXT: flat_load_u8 v9, v[10:11] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v9, 1, v9 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: v_dual_cndmask_b32 v11, 0, v7 :: v_dual_cndmask_b32 v10, 0, v6 +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, s0, v8 +; GFX1250-NEXT: v_add_f64_e64 v[4:5], v[4:5], -v[10:11] +; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1250-NEXT: s_cbranch_execnz .LBB8_1 +; GFX1250-NEXT: ; %bb.2: ; %exit +; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1250-NEXT: flat_store_b64 v[2:3], v[4:5] +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX90A-LABEL: cond_sub_loop_cond_variant: +; GFX90A: ; %bb.0: ; %entry +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s6, 0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v9, 0xbff00000 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: .LBB8_1: ; %loop +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_ashr_i32 s7, s6, 31 +; GFX90A-NEXT: v_mov_b32_e32 v11, s7 +; GFX90A-NEXT: v_add_co_u32_e32 v12, vcc, s6, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v13, vcc, v1, v11, vcc +; GFX90A-NEXT: flat_load_ubyte v11, v[12:13] +; GFX90A-NEXT: s_add_i32 s6, s6, 1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s6, v8 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_and_b32_e32 v11, 1, v11 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 1, v11 +; GFX90A-NEXT: v_cndmask_b32_e32 v11, 0, v9, vcc +; GFX90A-NEXT: v_fmac_f64_e32 v[4:5], v[10:11], v[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 +; GFX90A-NEXT: ; %bb.2: ; %exit +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: flat_store_dwordx2 v[2:3], v[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-MIR-LABEL: name: cond_sub_loop_cond_variant +; GFX90A-MIR: bb.0.entry: +; GFX90A-MIR-NEXT: successors: %bb.1(0x80000000) +; GFX90A-MIR-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 +; GFX90A-MIR-NEXT: {{ $}} +; GFX90A-MIR-NEXT: 
[[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 +; GFX90A-MIR-NEXT: undef [[COPY1:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr7 +; GFX90A-MIR-NEXT: [[COPY1:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr6 +; GFX90A-MIR-NEXT: undef [[COPY2:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr5 +; GFX90A-MIR-NEXT: [[COPY2:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr4 +; GFX90A-MIR-NEXT: undef [[COPY3:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr3 +; GFX90A-MIR-NEXT: [[COPY3:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr2 +; GFX90A-MIR-NEXT: undef [[COPY4:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr1 +; GFX90A-MIR-NEXT: undef [[COPY5:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr0 +; GFX90A-MIR-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 +; GFX90A-MIR-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 +; GFX90A-MIR-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1074790400, implicit $exec +; GFX90A-MIR-NEXT: undef [[AV_MOV_:%[0-9]+]].sub0:vreg_64_align2 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec +; GFX90A-MIR-NEXT: {{ $}} +; GFX90A-MIR-NEXT: bb.1.loop: +; GFX90A-MIR-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) +; GFX90A-MIR-NEXT: {{ $}} +; GFX90A-MIR-NEXT: [[S_ASHR_I32_:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_MOV_B32_]], 31, implicit-def dead $scc +; GFX90A-MIR-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_ASHR_I32_]] +; GFX90A-MIR-NEXT: undef [[V_ADD_CO_U32_e64_:%[0-9]+]].sub0:vreg_64_align2, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[S_MOV_B32_]], [[COPY5]].sub0, 0, implicit $exec +; GFX90A-MIR-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]].sub1:vreg_64_align2, dead [[V_ADDC_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]].sub1, [[COPY6]], [[V_ADD_CO_U32_e64_1]], 0, implicit $exec +; GFX90A-MIR-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[V_ADD_CO_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.cond_ptr) +; GFX90A-MIR-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_MOV_B32_]], 1, implicit-def dead $scc +; GFX90A-MIR-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[S_MOV_B32_]], [[COPY]], implicit $exec +; GFX90A-MIR-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 [[V_CMP_EQ_U32_e64_]], [[S_MOV_B64_]], implicit-def $scc +; GFX90A-MIR-NEXT: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[FLAT_LOAD_UBYTE]], implicit $exec +; GFX90A-MIR-NEXT: [[V_CMP_EQ_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 1, [[V_AND_B32_e32_]], implicit $exec +; GFX90A-MIR-NEXT: [[AV_MOV_:%[0-9]+]].sub1:vreg_64_align2 = V_CNDMASK_B32_e64 0, 0, 0, [[V_MOV_B32_e32_]], [[V_CMP_EQ_U32_e64_1]], implicit $exec +; GFX90A-MIR-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = nofpexcept V_FMAC_F64_e32 [[AV_MOV_]], [[COPY1]], [[COPY2]], implicit $mode, implicit $exec +; GFX90A-MIR-NEXT: $exec = S_ANDN2_B64_term $exec, [[S_MOV_B64_]], implicit-def $scc +; GFX90A-MIR-NEXT: S_CBRANCH_EXECNZ %bb.1, implicit $exec +; GFX90A-MIR-NEXT: S_BRANCH %bb.2 +; GFX90A-MIR-NEXT: {{ $}} +; GFX90A-MIR-NEXT: bb.2.exit: +; GFX90A-MIR-NEXT: $exec = S_OR_B64 $exec, [[S_MOV_B64_]], implicit-def $scc +; GFX90A-MIR-NEXT: FLAT_STORE_DWORDX2 [[COPY3]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into %ir.out) +; GFX90A-MIR-NEXT: SI_RETURN +; Expect conversion to FMA. 
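+; Illustrative note (register names are made up): since the low dword of both -1.0
+; and 0.0 is zero, the fma form needs just one per-iteration v_cndmask for the high
+; half of the multiplier,
+;       v_cndmask_b32 mul.hi, 0, 0xbff00000
+;       v_fmac_f64    acc, mul, c
+; instead of the two v_cndmasks plus v_add_f64 seen in the GFX1250 checks above.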
+entry: + br label %loop + +loop: + %i = phi i32 [ 0, %entry ], [ %i.next, %loop ] + %acc = phi double [ %a, %entry ], [ %result, %loop ] + + %cond_ptr = getelementptr i1, ptr %conds, i32 %i + %cond = load i1, ptr %cond_ptr + %sel = select i1 %cond, double %c, double 0.0 + %result = fsub double %acc, %sel + + %i.next = add i32 %i, 1 + %exit.cond = icmp eq i32 %i.next, %n + br i1 %exit.cond, label %exit, label %loop + +exit: + store double %result, ptr %out + ret void +} + +; Loop where both condition and value depend on loop index +define void @cond_sub_loop_both_variant(ptr %conds, ptr %values, ptr %out, double %a, i32 %n) { +; GFX1250-LABEL: cond_sub_loop_both_variant: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s2, 0 +; GFX1250-NEXT: s_mov_b32 s0, 0 +; GFX1250-NEXT: .LBB9_1: ; %loop +; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: s_ashr_i32 s1, s0, 31 +; GFX1250-NEXT: v_add_nc_u64_e32 v[10:11], s[0:1], v[0:1] +; GFX1250-NEXT: flat_load_u8 v9, v[10:11] +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: v_lshl_add_u64 v[10:11], s[0:1], 3, v[2:3] +; GFX1250-NEXT: s_add_co_i32 s0, s0, 1 +; GFX1250-NEXT: flat_load_b64 v[10:11], v[10:11] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x101 +; GFX1250-NEXT: v_and_b32_e32 v9, 1, v9 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: v_dual_cndmask_b32 v11, 0, v11 :: v_dual_cndmask_b32 v10, 0, v10 +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, s0, v8 +; GFX1250-NEXT: v_add_f64_e64 v[6:7], v[6:7], -v[10:11] +; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1250-NEXT: s_cbranch_execnz .LBB9_1 +; GFX1250-NEXT: ; %bb.2: ; %exit +; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1250-NEXT: flat_store_b64 v[4:5], v[6:7] +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX90A-LABEL: cond_sub_loop_both_variant: +; GFX90A: ; %bb.0: ; %entry +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s8, 0 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_mov_b32_e32 v9, 0xbff00000 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: .LBB9_1: ; %loop +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_ashr_i32 s9, s8, 31 +; GFX90A-NEXT: v_mov_b32_e32 v11, s9 +; GFX90A-NEXT: v_add_co_u32_e32 v12, vcc, s8, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v13, vcc, v1, v11, vcc +; GFX90A-NEXT: flat_load_ubyte v11, v[12:13] +; GFX90A-NEXT: s_lshl_b64 s[4:5], s[8:9], 3 +; GFX90A-NEXT: v_mov_b32_e32 v13, s5 +; GFX90A-NEXT: v_add_co_u32_e32 v12, vcc, s4, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v13, vcc, v3, v13, vcc +; GFX90A-NEXT: flat_load_dwordx2 v[12:13], v[12:13] +; GFX90A-NEXT: s_add_i32 s8, s8, 1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s8, v8 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_and_b32_e32 v11, 1, v11 +; GFX90A-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v11 +; GFX90A-NEXT: v_cndmask_b32_e64 v11, 0, v9, s[4:5] +; GFX90A-NEXT: v_fmac_f64_e32 v[6:7], v[10:11], v[12:13] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 +; GFX90A-NEXT: ; %bb.2: ; %exit +; GFX90A-NEXT: s_or_b64 
exec, exec, s[6:7] +; GFX90A-NEXT: flat_store_dwordx2 v[4:5], v[6:7] +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-MIR-LABEL: name: cond_sub_loop_both_variant +; GFX90A-MIR: bb.0.entry: +; GFX90A-MIR-NEXT: successors: %bb.1(0x80000000) +; GFX90A-MIR-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 +; GFX90A-MIR-NEXT: {{ $}} +; GFX90A-MIR-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 +; GFX90A-MIR-NEXT: undef [[COPY1:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr7 +; GFX90A-MIR-NEXT: [[COPY1:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr6 +; GFX90A-MIR-NEXT: undef [[COPY2:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr5 +; GFX90A-MIR-NEXT: [[COPY2:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr4 +; GFX90A-MIR-NEXT: undef [[COPY3:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr3 +; GFX90A-MIR-NEXT: undef [[COPY4:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr2 +; GFX90A-MIR-NEXT: undef [[COPY5:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr1 +; GFX90A-MIR-NEXT: undef [[COPY6:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr0 +; GFX90A-MIR-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = S_MOV_B32 0 +; GFX90A-MIR-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 +; GFX90A-MIR-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1074790400, implicit $exec +; GFX90A-MIR-NEXT: undef [[AV_MOV_:%[0-9]+]].sub0:vreg_64_align2 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec +; GFX90A-MIR-NEXT: {{ $}} +; GFX90A-MIR-NEXT: bb.1.loop: +; GFX90A-MIR-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) +; GFX90A-MIR-NEXT: {{ $}} +; GFX90A-MIR-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_ASHR_I32 [[S_MOV_B32_]].sub0, 31, implicit-def dead $scc +; GFX90A-MIR-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]].sub1 +; GFX90A-MIR-NEXT: undef [[V_ADD_CO_U32_e64_:%[0-9]+]].sub0:vreg_64_align2, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[S_MOV_B32_]].sub0, [[COPY6]].sub0, 0, implicit $exec +; GFX90A-MIR-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]].sub1:vreg_64_align2, dead [[V_ADDC_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]].sub1, [[COPY7]], [[V_ADD_CO_U32_e64_1]], 0, implicit $exec +; GFX90A-MIR-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[V_ADD_CO_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.cond_ptr) +; GFX90A-MIR-NEXT: [[S_LSHL_B64_:%[0-9]+]]:sreg_64 = S_LSHL_B64 [[S_MOV_B32_]], 3, implicit-def dead $scc +; GFX90A-MIR-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_LSHL_B64_]].sub1 +; GFX90A-MIR-NEXT: undef [[V_ADD_CO_U32_e64_2:%[0-9]+]].sub0:vreg_64_align2, [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[S_LSHL_B64_]].sub0, [[COPY4]].sub0, 0, implicit $exec +; GFX90A-MIR-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]].sub1:vreg_64_align2, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]].sub1, [[COPY8]], [[V_ADD_CO_U32_e64_3]], 0, implicit $exec +; GFX90A-MIR-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64_align2 = FLAT_LOAD_DWORDX2 [[V_ADD_CO_U32_e64_2]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s64) from %ir.value_ptr) +; GFX90A-MIR-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = S_ADD_I32 [[S_MOV_B32_]].sub0, 1, implicit-def dead $scc +; GFX90A-MIR-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[S_MOV_B32_]].sub0, [[COPY]], implicit $exec +; GFX90A-MIR-NEXT: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[FLAT_LOAD_UBYTE]], implicit $exec +; GFX90A-MIR-NEXT: [[V_CMP_EQ_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 
1, [[V_AND_B32_e32_]], implicit $exec +; GFX90A-MIR-NEXT: [[AV_MOV_:%[0-9]+]].sub1:vreg_64_align2 = V_CNDMASK_B32_e64 0, 0, 0, [[V_MOV_B32_e32_]], [[V_CMP_EQ_U32_e64_1]], implicit $exec +; GFX90A-MIR-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 [[V_CMP_EQ_U32_e64_]], [[S_MOV_B64_]], implicit-def $scc +; GFX90A-MIR-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_FMAC_F64_e32 [[AV_MOV_]], [[FLAT_LOAD_DWORDX2_]], [[COPY1]], implicit $mode, implicit $exec +; GFX90A-MIR-NEXT: $exec = S_ANDN2_B64_term $exec, [[S_MOV_B64_]], implicit-def $scc +; GFX90A-MIR-NEXT: S_CBRANCH_EXECNZ %bb.1, implicit $exec +; GFX90A-MIR-NEXT: S_BRANCH %bb.2 +; GFX90A-MIR-NEXT: {{ $}} +; GFX90A-MIR-NEXT: bb.2.exit: +; GFX90A-MIR-NEXT: $exec = S_OR_B64 $exec, [[S_MOV_B64_]], implicit-def $scc +; GFX90A-MIR-NEXT: FLAT_STORE_DWORDX2 [[COPY2]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into %ir.out) +; GFX90A-MIR-NEXT: SI_RETURN +; Expect conversion to FMA. +entry: + br label %loop + +loop: + %i = phi i32 [ 0, %entry ], [ %i.next, %loop ] + %acc = phi double [ %a, %entry ], [ %result, %loop ] + + %cond_ptr = getelementptr i1, ptr %conds, i32 %i + %cond = load i1, ptr %cond_ptr + %value_ptr = getelementptr double, ptr %values, i32 %i + %c = load double, ptr %value_ptr + + %sel = select i1 %cond, double %c, double 0.0 + %result = fsub double %acc, %sel + + %i.next = add i32 %i, 1 + %exit.cond = icmp eq i32 %i.next, %n + br i1 %exit.cond, label %exit, label %loop + +exit: + store double %result, ptr %out + ret void +} + +; ============================================================================ +; Multiple patterns in a loop +; ============================================================================ +; Two conditional subtractions in same loop iteration +define void @cond_sub_loop_two_ops(ptr %out, double %a, double %c1, double %c2, i1 %cond1, i1 %cond2, i32 %n) { +; GFX1250-LABEL: cond_sub_loop_two_ops: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v8, 1, v8 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8 +; GFX1250-NEXT: v_dual_cndmask_b32 v4, 0, v4, vcc_lo :: v_dual_bitop2_b32 v9, 1, v9 bitop3:0x40 +; GFX1250-NEXT: v_cmp_eq_u32_e64 s0, 1, v9 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_dual_cndmask_b32 v5, 0, v5, vcc_lo :: v_dual_cndmask_b32 v7, 0, v7, s0 +; GFX1250-NEXT: v_cndmask_b32_e64 v6, 0, v6, s0 +; GFX1250-NEXT: s_mov_b32 s0, 0 +; GFX1250-NEXT: .LBB10_1: ; %loop +; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_add_f64 v[2:3], v[2:3], -v[4:5] :: v_dual_add_nc_u32 v10, -1, v10 +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v10 +; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-NEXT: v_add_f64_e64 v[2:3], v[2:3], -v[6:7] +; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1250-NEXT: s_cbranch_execnz .LBB10_1 +; GFX1250-NEXT: ; %bb.2: ; %exit +; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-NEXT: flat_store_b64 v[0:1], v[2:3] +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX90A-LABEL: cond_sub_loop_two_ops: +; GFX90A: ; %bb.0: ; %entry +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_and_b32_e32 v8, 1, v8 +; GFX90A-NEXT: v_and_b32_e32 v9, 1, v9 
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 +; GFX90A-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9 +; GFX90A-NEXT: v_cndmask_b32_e32 v7, 0, v7, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB10_1: ; %loop +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_add_u32_e32 v10, -1, v10 +; GFX90A-NEXT: v_add_f64 v[2:3], v[2:3], -v[4:5] +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v10 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_add_f64 v[2:3], v[2:3], -v[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 +; GFX90A-NEXT: ; %bb.2: ; %exit +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-MIR-LABEL: name: cond_sub_loop_two_ops +; GFX90A-MIR: bb.0.entry: +; GFX90A-MIR-NEXT: successors: %bb.1(0x80000000) +; GFX90A-MIR-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10 +; GFX90A-MIR-NEXT: {{ $}} +; GFX90A-MIR-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr10 +; GFX90A-MIR-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr9 +; GFX90A-MIR-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr8 +; GFX90A-MIR-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr7 +; GFX90A-MIR-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr6 +; GFX90A-MIR-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 +; GFX90A-MIR-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 +; GFX90A-MIR-NEXT: undef [[COPY7:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr3 +; GFX90A-MIR-NEXT: [[COPY7:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr2 +; GFX90A-MIR-NEXT: undef [[COPY8:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr1 +; GFX90A-MIR-NEXT: [[COPY8:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr0 +; GFX90A-MIR-NEXT: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY1]], implicit $exec +; GFX90A-MIR-NEXT: [[V_AND_B32_e32_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY2]], implicit $exec +; GFX90A-MIR-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 1, [[V_AND_B32_e32_1]], implicit $exec +; GFX90A-MIR-NEXT: undef [[V_CNDMASK_B32_e64_:%[0-9]+]].sub1:vreg_64_align2 = V_CNDMASK_B32_e64 0, 0, 0, [[COPY5]], [[V_CMP_EQ_U32_e64_]], implicit $exec +; GFX90A-MIR-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]].sub0:vreg_64_align2 = V_CNDMASK_B32_e64 0, 0, 0, [[COPY6]], [[V_CMP_EQ_U32_e64_]], implicit $exec +; GFX90A-MIR-NEXT: [[V_CMP_EQ_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 1, [[V_AND_B32_e32_]], implicit $exec +; GFX90A-MIR-NEXT: undef [[V_CNDMASK_B32_e64_1:%[0-9]+]].sub1:vreg_64_align2 = V_CNDMASK_B32_e64 0, 0, 0, [[COPY3]], [[V_CMP_EQ_U32_e64_1]], implicit $exec +; GFX90A-MIR-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]].sub0:vreg_64_align2 = V_CNDMASK_B32_e64 0, 0, 0, [[COPY4]], [[V_CMP_EQ_U32_e64_1]], implicit $exec +; GFX90A-MIR-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 +; GFX90A-MIR-NEXT: {{ $}} +; GFX90A-MIR-NEXT: bb.1.loop: +; GFX90A-MIR-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) +; GFX90A-MIR-NEXT: {{ $}} +; GFX90A-MIR-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[COPY7]], 1, [[V_CNDMASK_B32_e64_]], 0, 0, implicit $mode, implicit $exec +; GFX90A-MIR-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 -1, [[COPY]], implicit $exec +; GFX90A-MIR-NEXT: [[V_CMP_EQ_U32_e64_2:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 0, [[COPY]], 
implicit $exec +; GFX90A-MIR-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 [[V_CMP_EQ_U32_e64_2]], [[S_MOV_B64_]], implicit-def $scc +; GFX90A-MIR-NEXT: [[COPY7:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[V_ADD_F64_e64_]], 1, [[V_CNDMASK_B32_e64_1]], 0, 0, implicit $mode, implicit $exec +; GFX90A-MIR-NEXT: $exec = S_ANDN2_B64_term $exec, [[S_MOV_B64_]], implicit-def $scc +; GFX90A-MIR-NEXT: S_CBRANCH_EXECNZ %bb.1, implicit $exec +; GFX90A-MIR-NEXT: S_BRANCH %bb.2 +; GFX90A-MIR-NEXT: {{ $}} +; GFX90A-MIR-NEXT: bb.2.exit: +; GFX90A-MIR-NEXT: $exec = S_OR_B64 $exec, [[S_MOV_B64_]], implicit-def $scc +; GFX90A-MIR-NEXT: FLAT_STORE_DWORDX2 [[COPY8]], [[COPY7]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into %ir.out) +; GFX90A-MIR-NEXT: SI_RETURN +; Applies the same comment as for cond_sub_loop_invariant. Expect no FMA conversion. +entry: + br label %loop + +loop: + %i = phi i32 [ 0, %entry ], [ %i.next, %loop ] + %acc = phi double [ %a, %entry ], [ %result2, %loop ] + + %sel1 = select i1 %cond1, double %c1, double 0.0 + %result1 = fsub double %acc, %sel1 + + %sel2 = select i1 %cond2, double %c2, double 0.0 + %result2 = fsub double %result1, %sel2 + + %i.next = add i32 %i, 1 + %exit.cond = icmp eq i32 %i.next, %n + br i1 %exit.cond, label %exit, label %loop + +exit: + store double %result2, ptr %out + ret void +} + +; ============================================================================ +; Negative tests - patterns that should NOT be optimized +; ============================================================================ + +; f32 should not be optimized (no FMAC f64 for f32) +define float @cond_sub_f32(float %a, float %c, i1 %cond) { +; GFX1250-LABEL: cond_sub_f32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX1250-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo +; GFX1250-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX90A-LABEL: cond_sub_f32: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX90A-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-MIR-LABEL: name: cond_sub_f32 +; GFX90A-MIR: bb.0 (%ir-block.0): +; GFX90A-MIR-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 +; GFX90A-MIR-NEXT: {{ $}} +; GFX90A-MIR-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 +; GFX90A-MIR-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 +; GFX90A-MIR-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 +; GFX90A-MIR-NEXT: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY]], implicit $exec +; GFX90A-MIR-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 1, [[V_AND_B32_e32_]], implicit $exec +; GFX90A-MIR-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, [[COPY1]], [[V_CMP_EQ_U32_e64_]], implicit $exec +; GFX90A-MIR-NEXT: [[V_SUB_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_SUB_F32_e32 [[COPY2]], [[V_CNDMASK_B32_e64_]], implicit $mode, implicit $exec +; GFX90A-MIR-NEXT: $vgpr0 = COPY [[V_SUB_F32_e32_]] +; GFX90A-MIR-NEXT: SI_RETURN implicit $vgpr0 + %sel = select i1 %cond, float %c, float 0.0 + %result = fsub float %a, %sel + ret float %result +} + +; False value is not 0.0 +define double 
@cond_sub_wrong_false_value(double %a, double %c, i1 %cond) { +; GFX1250-LABEL: cond_sub_wrong_false_value: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 +; GFX1250-NEXT: v_cndmask_b32_e32 v3, 0x3ff00000, v3, vcc_lo +; GFX1250-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc_lo +; GFX1250-NEXT: v_add_f64_e64 v[0:1], v[0:1], -v[2:3] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX90A-LABEL: cond_sub_wrong_false_value: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0x3ff00000 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX90A-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-MIR-LABEL: name: cond_sub_wrong_false_value +; GFX90A-MIR: bb.0 (%ir-block.0): +; GFX90A-MIR-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 +; GFX90A-MIR-NEXT: {{ $}} +; GFX90A-MIR-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr4 +; GFX90A-MIR-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3 +; GFX90A-MIR-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 +; GFX90A-MIR-NEXT: undef [[COPY3:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr1 +; GFX90A-MIR-NEXT: [[COPY3:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr0 +; GFX90A-MIR-NEXT: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY]], implicit $exec +; GFX90A-MIR-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1072693248, implicit $exec +; GFX90A-MIR-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 1, [[V_AND_B32_e32_]], implicit $exec +; GFX90A-MIR-NEXT: undef [[V_CNDMASK_B32_e64_:%[0-9]+]].sub1:vreg_64_align2 = V_CNDMASK_B32_e64 0, [[V_MOV_B32_e32_]], 0, [[COPY1]], [[V_CMP_EQ_U32_e64_]], implicit $exec +; GFX90A-MIR-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]].sub0:vreg_64_align2 = V_CNDMASK_B32_e64 0, 0, 0, [[COPY2]], [[V_CMP_EQ_U32_e64_]], implicit $exec +; GFX90A-MIR-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[COPY3]], 1, [[V_CNDMASK_B32_e64_]], 0, 0, implicit $mode, implicit $exec +; GFX90A-MIR-NEXT: $vgpr0 = COPY [[V_ADD_F64_e64_]].sub0 +; GFX90A-MIR-NEXT: $vgpr1 = COPY [[V_ADD_F64_e64_]].sub1 +; GFX90A-MIR-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1 + %sel = select i1 %cond, double %c, double 1.0 + %result = fsub double %a, %sel + ret double %result +} diff --git a/llvm/test/CodeGen/AMDGPU/revert-fma-cond-sub.mir b/llvm/test/CodeGen/AMDGPU/revert-fma-cond-sub.mir new file mode 100644 index 0000000000000..bb31f6aef0866 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/revert-fma-cond-sub.mir @@ -0,0 +1,236 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -run-pass=amdgpu-pre-ra-optimizations -verify-machineinstrs %s -o - | FileCheck %s + +# Test for GCNPreRAOptimizationsImpl::revertConditionalFMAPattern +--- +name: cond_sub_in_loop +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: cond_sub_in_loop + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vcc + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = 
COPY $vgpr2_vgpr3 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_64_xexec = COPY $vcc + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: undef [[V_CNDMASK_B32_e64_:%[0-9]+]].sub0:vreg_64_align2 = V_CNDMASK_B32_e64 0, 0, 0, [[COPY1]].sub0, [[COPY2]], implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]].sub1:vreg_64_align2 = V_CNDMASK_B32_e64 0, 0, 0, [[COPY1]].sub1, [[COPY2]], implicit $exec + ; CHECK-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = V_ADD_F64_e64 0, [[COPY]], 1, [[V_CNDMASK_B32_e64_]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY [[V_ADD_F64_e64_]] + ; CHECK-NEXT: S_BRANCH %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vcc + + %0:vreg_64_align2 = COPY $vgpr0_vgpr1 + %1:vreg_64_align2 = COPY $vgpr2_vgpr3 + %2:sreg_64_xexec = COPY $vcc + + bb.1: + ; Pattern: FMA with conditional multiplier (cond ? -1.0 : 0.0) * value + ; Expecting revert. It's almost impossible to get this pattern in a loop + ; unless rematerializer goes wild, but this also checks the reverter in a + ; non-loop context. + %3:vgpr_32 = V_MOV_B32_e32 -1074790400, implicit $exec ; -1.0 high bits (0xbff00000) + undef %4.sub1:vreg_64_align2 = V_CNDMASK_B32_e64 0, 0, 0, %3, %2, implicit $exec ; mul.hi = cond ? -1.0_hi : 0 + %4.sub0:vreg_64_align2 = V_MOV_B32_e32 0, implicit $exec ; mul.lo = 0 + %5:vreg_64_align2 = nofpexcept V_FMAC_F64_e32 %4, %1, %0, implicit $mode, implicit $exec ; accum + mul * value + %0:vreg_64_align2 = COPY %5 + S_BRANCH %bb.1 + + bb.2: + S_ENDPGM 0 +... + +--- +name: cond_sub_const_before_loop +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: cond_sub_const_before_loop + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vcc + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY $vgpr2_vgpr3 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_64_xexec = COPY $vcc + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: undef [[V_CNDMASK_B32_e64_:%[0-9]+]].sub0:vreg_64_align2 = V_CNDMASK_B32_e64 0, 0, 0, [[COPY1]].sub0, [[COPY2]], implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]].sub1:vreg_64_align2 = V_CNDMASK_B32_e64 0, 0, 0, [[COPY1]].sub1, [[COPY2]], implicit $exec + ; CHECK-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = V_ADD_F64_e64 0, [[COPY]], 1, [[V_CNDMASK_B32_e64_]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY [[V_ADD_F64_e64_]] + ; CHECK-NEXT: S_BRANCH %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vcc + + %0:vreg_64_align2 = COPY $vgpr0_vgpr1 + %1:vreg_64_align2 = COPY $vgpr2_vgpr3 + %2:sreg_64_xexec = COPY $vcc + ; -1 constant initialization before loop + %3:vgpr_32 = V_MOV_B32_e32 -1074790400, implicit $exec + + bb.1: + ; Pattern: Only V_MOV(0) and V_CNDMASK are in loop + ; Same as previous, expecting revert. + undef %4.sub1:vreg_64_align2 = V_CNDMASK_B32_e64 0, 0, 0, %3, %2, implicit $exec + %4.sub0:vreg_64_align2 = V_MOV_B32_e32 0, implicit $exec + %5:vreg_64_align2 = nofpexcept V_FMAC_F64_e32 %4, %1, %0, implicit $mode, implicit $exec + %0:vreg_64_align2 = COPY %5 + S_BRANCH %bb.1 + + bb.2: + S_ENDPGM 0 +... 
+ +--- +name: cond_sub_zero_const_before_loop +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: cond_sub_zero_const_before_loop + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vcc + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY $vgpr2_vgpr3 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_64_xexec = COPY $vcc + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: undef [[V_CNDMASK_B32_e64_:%[0-9]+]].sub0:vreg_64_align2 = V_CNDMASK_B32_e64 0, 0, 0, [[COPY1]].sub0, [[COPY2]], implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]].sub1:vreg_64_align2 = V_CNDMASK_B32_e64 0, 0, 0, [[COPY1]].sub1, [[COPY2]], implicit $exec + ; CHECK-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = V_ADD_F64_e64 0, [[COPY]], 1, [[V_CNDMASK_B32_e64_]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY [[V_ADD_F64_e64_]] + ; CHECK-NEXT: S_BRANCH %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vcc + + %0:vreg_64_align2 = COPY $vgpr0_vgpr1 + %1:vreg_64_align2 = COPY $vgpr2_vgpr3 + %2:sreg_64_xexec = COPY $vcc + ; 0 constant initialization before loop + undef %4.sub0:vreg_64_align2 = V_MOV_B32_e32 0, implicit $exec + + bb.1: + ; Pattern: V_MOV(-1.0) and V_CNDMASK are in loop + ; Same as previous, expecting revert. + %3:vgpr_32 = V_MOV_B32_e32 -1074790400, implicit $exec + %4.sub1:vreg_64_align2 = V_CNDMASK_B32_e64 0, 0, 0, %3, %2, implicit $exec + %5:vreg_64_align2 = nofpexcept V_FMAC_F64_e32 %4, %1, %0, implicit $mode, implicit $exec + %0:vreg_64_align2 = COPY %5 + S_BRANCH %bb.1 + + bb.2: + S_ENDPGM 0 +... 
+ +--- +name: cond_sub_both_consts_before_loop +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: cond_sub_both_consts_before_loop + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vcc + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY $vgpr2_vgpr3 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_64_xexec = COPY $vcc + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1074790400, implicit $exec + ; CHECK-NEXT: undef [[V_MOV_B32_e32_1:%[0-9]+]].sub0:vreg_64_align2 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]].sub1:vreg_64_align2 = V_CNDMASK_B32_e64 0, 0, 0, [[V_MOV_B32_e32_]], [[COPY2]], implicit $exec + ; CHECK-NEXT: [[V_FMAC_F64_e32_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_FMAC_F64_e32 [[V_MOV_B32_e32_1]], [[COPY1]], [[COPY]], implicit $mode, implicit $exec + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY [[V_FMAC_F64_e32_]] + ; CHECK-NEXT: S_BRANCH %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vcc + + %0:vreg_64_align2 = COPY $vgpr0_vgpr1 + %1:vreg_64_align2 = COPY $vgpr2_vgpr3 + %2:sreg_64_xexec = COPY $vcc + ; BOTH constant initializations before loop + %3:vgpr_32 = V_MOV_B32_e32 -1074790400, implicit $exec + undef %4.sub0:vreg_64_align2 = V_MOV_B32_e32 0, implicit $exec + + bb.1: + ; Pattern: Only V_FMAC and V_CNDMASK are in loop + ; This is the most expected pattern, no revert should be done. + %4.sub1:vreg_64_align2 = V_CNDMASK_B32_e64 0, 0, 0, %3, %2, implicit $exec + %5:vreg_64_align2 = nofpexcept V_FMAC_F64_e32 %4, %1, %0, implicit $mode, implicit $exec + %0:vreg_64_align2 = COPY %5 + S_BRANCH %bb.1 + + bb.2: + S_ENDPGM 0 +... 
+--- +name: cond_only_fma_in_loop +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: cond_only_fma_in_loop + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vcc + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY $vgpr2_vgpr3 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_64_xexec = COPY $vcc + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1074790400, implicit $exec + ; CHECK-NEXT: undef [[V_MOV_B32_e32_1:%[0-9]+]].sub0:vreg_64_align2 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]].sub1:vreg_64_align2 = V_CNDMASK_B32_e64 0, 0, 0, [[V_MOV_B32_e32_]], [[COPY2]], implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[V_FMAC_F64_e32_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_FMAC_F64_e32 [[V_MOV_B32_e32_1]], [[COPY1]], [[COPY]], implicit $mode, implicit $exec + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY [[V_FMAC_F64_e32_]] + ; CHECK-NEXT: S_BRANCH %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vcc + %0:vreg_64_align2 = COPY $vgpr0_vgpr1 + %1:vreg_64_align2 = COPY $vgpr2_vgpr3 + %2:sreg_64_xexec = COPY $vcc + ; BOTH constant initializations and V_CNDMASK before loop + %3:vgpr_32 = V_MOV_B32_e32 -1074790400, implicit $exec + undef %4.sub0:vreg_64_align2 = V_MOV_B32_e32 0, implicit $exec + %4.sub1:vreg_64_align2 = V_CNDMASK_B32_e64 0, 0, 0, %3, %2, implicit $exec + + bb.1: + ; Pattern: Only V_FMAC is in loop + ; Selectors don't generate this pattern, but it probably may come from user + ; code. Ideally we should revert this to save registers and use cheaper f_add + ; but skip this for now as it's harder to revert. + %5:vreg_64_align2 = nofpexcept V_FMAC_F64_e32 %4, %1, %0, implicit $mode, implicit $exec + %0:vreg_64_align2 = COPY %5 + S_BRANCH %bb.1 + + bb.2: + S_ENDPGM 0 +...
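+
+# Illustrative summary of the revert exercised above (vreg names and operand forms
+# are sketched from the checks, not authoritative): when the V_CNDMASK_B32 producing
+# the -1.0/0.0 multiplier ends up inside the loop, the pass rewrites
+#   undef %m.sub1:vreg_64_align2 = V_CNDMASK_B32_e64 0, 0, 0, %neg_one_hi, %cc, implicit $exec
+#   %m.sub0:vreg_64_align2 = V_MOV_B32_e32 0, implicit $exec
+#   %acc:vreg_64_align2 = nofpexcept V_FMAC_F64_e32 %m, %c, %acc, implicit $mode, implicit $exec
+# back into
+#   undef %s.sub0:vreg_64_align2 = V_CNDMASK_B32_e64 0, 0, 0, %c.sub0, %cc, implicit $exec
+#   %s.sub1:vreg_64_align2 = V_CNDMASK_B32_e64 0, 0, 0, %c.sub1, %cc, implicit $exec
+#   %acc:vreg_64_align2 = V_ADD_F64_e64 0, %acc, 1, %s, 0, 0, implicit $mode, implicit $exec
+# and leaves the fma alone when both constant initializations already sit before
+# the loop, as in the last two tests.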