From ebda35369dcd698ebf82696b2239f76ff3e1de98 Mon Sep 17 00:00:00 2001 From: Changpeng Fang Date: Thu, 30 Oct 2025 09:44:25 -0700 Subject: [PATCH 1/2] [AMDGPU] Support bfloat comparison for ballot intrinsic (#165495) We do not have native instructions for direct bfloat comparisons. However, we can expand bfloat to float, and do float comparison instead. TODO: handle bfloat comparison for ballot intrinsic on global isel path. Fixes: SWDEV-563403 --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 10 +++++++-- .../CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll | 21 +++++++++++++++++++ .../CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll | 12 +++++++++++ 3 files changed, 41 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index eef4a29fe49ef..d8e5ad9a6aa02 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -6430,9 +6430,15 @@ static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SDLoc SL(N); if (Src.getOpcode() == ISD::SETCC) { + SDValue Op0 = Src.getOperand(0); + SDValue Op1 = Src.getOperand(1); + // Need to expand bfloat to float for comparison (setcc). + if (Op0.getValueType() == MVT::bf16) { + Op0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op0); + Op1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op1); + } // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...) 
- return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0), - Src.getOperand(1), Src.getOperand(2)); + return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Op0, Op1, Src.getOperand(2)); } if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) { // (ballot 0) -> 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll index e00e1f13b2b77..9940ea70d3467 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll @@ -591,3 +591,24 @@ exit: store i32 %ballot, ptr addrspace(1) %out ret void } + +define amdgpu_cs i32 @compare_bfloats(bfloat %x, bfloat %y) { +; GFX10-LABEL: compare_bfloats: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: v_cmp_gt_f32_e64 s0, v0, v1 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: compare_bfloats: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX11-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11-NEXT: v_mov_b16_e32 v1.l, v2.l +; GFX11-NEXT: v_cmp_gt_f32_e64 s0, v1, v2 +; GFX11-NEXT: ; return to shader part epilog + %cmp = fcmp ogt bfloat %x, %y + %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp) + ret i32 %ballot +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll index b4adf7f641550..1720a62eb6367 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll @@ -557,3 +557,15 @@ exit: store i64 %ballot, ptr addrspace(1) %out ret void } + +define amdgpu_cs i64 @compare_bfloats(bfloat %x, bfloat %y) { +; CHECK-LABEL: compare_bfloats: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CHECK-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; CHECK-NEXT: v_cmp_gt_f32_e64 s[0:1], v0, v1 +; CHECK-NEXT: ; return to shader part epilog + %cmp = fcmp ogt bfloat %x, %y + %ballot
= call i64 @llvm.amdgcn.ballot.i64(i1 %cmp) + ret i64 %ballot +} From 219cce93516292a129287d1e90cf009b4a34e5fc Mon Sep 17 00:00:00 2001 From: Changpeng Fang Date: Tue, 4 Nov 2025 14:52:17 -0800 Subject: [PATCH 2/2] [AMDGPU] Support bfloat comparison for ballot intrinsic (#165495) Fix the lit test failure --- .../CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll | 21 ++++++------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll index 9940ea70d3467..35cd0fdc5281a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll @@ -593,21 +593,12 @@ exit: } define amdgpu_cs i32 @compare_bfloats(bfloat %x, bfloat %y) { -; GFX10-LABEL: compare_bfloats: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_cmp_gt_f32_e64 s0, v0, v1 -; GFX10-NEXT: ; return to shader part epilog -; -; GFX11-LABEL: compare_bfloats: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_mov_b16_e32 v2.l, 0 -; GFX11-NEXT: v_mov_b16_e32 v2.h, v1.l -; GFX11-NEXT: v_mov_b16_e32 v1.h, v0.l -; GFX11-NEXT: v_mov_b16_e32 v1.l, v2.l -; GFX11-NEXT: v_cmp_gt_f32_e64 s0, v1, v2 -; GFX11-NEXT: ; return to shader part epilog +; CHECK-LABEL: compare_bfloats: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CHECK-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; CHECK-NEXT: v_cmp_gt_f32_e64 s0, v0, v1 +; CHECK-NEXT: ; return to shader part epilog %cmp = fcmp ogt bfloat %x, %y %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp) ret i32 %ballot