IGCVectorizer now supports I32 PHI

esukhov · igcbot · commit cca2a9fe60be · 2025-08-28T16:00:49.000+02:00
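IGCVectorizer now supports I32 Phi instructions: the vectorizer's type check accepts i32 in addition to float when the new VectorizerAllowI32 regkey is set (default: true).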
diff --git a/IGC/Compiler/CISACodeGen/IGCVectorizer.cpp b/IGC/Compiler/CISACodeGen/IGCVectorizer.cpp
@@ -259,6 +259,11 @@ bool isFloatTyped(Instruction *I) {
   return I->getType()->isFloatTy();
 }
 
+bool isAllowedType(Instruction *I) {
+  if (isFloatTyped(I)) return true; // float-typed results are always accepted
+  return IGC_GET_FLAG_VALUE(VectorizerAllowI32) && I->getType()->isIntegerTy(32); // i32 only behind the regkey
+}
+
 bool isIntrinsicSafe(Instruction *I) {
   bool Result = false;
   IntrinsicInst *IntrinsicI = llvm::dyn_cast<IntrinsicInst>(I);
@@ -297,14 +302,14 @@ bool isSafeToVectorize(Instruction *I) {
 
   // the only typed instructions we add to slices => Insert elements
   bool IsVectorTyped = I->getType()->isVectorTy();
-  bool IsFloat = isFloatTyped(I);
+  bool IsAllowedType = isAllowedType(I);
 
   bool Result =
       isPHISafe(I) || IsExtract ||
       isBinarySafe(I) || isIntrinsicSafe(I) || isAllowedStub(I);
 
   // all allowed instructions that are float typed and not vectors
-  Result = (Result && IsFloat && !IsVectorTyped);
+  Result = (Result && IsAllowedType && !IsVectorTyped);
   // always allowed
   Result |= IsFpTrunc;
   // only Float insert elements are allowed
diff --git a/IGC/Compiler/tests/EmitVISAPass/vectorizer-vector-emission-phi-i32.ll b/IGC/Compiler/tests/EmitVISAPass/vectorizer-vector-emission-phi-i32.ll
@@ -0,0 +1,51 @@
+; UNSUPPORTED: system-windows
+; REQUIRES: regkeys
+
+; RUN: igc_opt -S -dce -platformpvc -rev-id B -has-emulated-64-bit-insts -igc-emit-visa --regkey=DumpVISAASMToConsole=1 -simd-mode 16 < %s | FileCheck %s
+
+; CHECK: .decl vectorized_phi v_type=G type=d num_elts=128 align=wordx32
+
+; CHECK: mov (M1, 16) vectorized_phi(0,0)<1> 0x0:d
+; CHECK: mov (M1, 16) vectorized_phi(1,0)<1> 0x0:d
+; CHECK: mov (M1, 16) vectorized_phi(2,0)<1> 0x0:d
+; CHECK: mov (M1, 16) vectorized_phi(3,0)<1> 0x0:d
+; CHECK: mov (M1, 16) vectorized_phi(4,0)<1> 0x0:d
+; CHECK: mov (M1, 16) vectorized_phi(5,0)<1> 0x0:d
+; CHECK: mov (M1, 16) vectorized_phi(6,0)<1> 0x0:d
+; CHECK: mov (M1, 16) vectorized_phi(7,0)<1> 0x0:d
+
+; CHECK: dpas.?.?.0.0 (M1, 16) vectorized_phi.0 vectorized_phi.0
+
+
+; ModuleID = 'vectorizer-vector-emission-fmad.ll'
+source_filename = "vectorizer-vector-emission-fmad.ll"
+
+define spir_kernel void @_attn_fwd(half addrspace(1)* %0, half addrspace(1)* %1, half addrspace(1)* %2, float %3, i8 addrspace(1)* %4, float addrspace(1)* %5, <8 x i32> %r0, <8 x i32> %payloadHeader, i8* %privateBase, i32 %bufferOffset, i32 %bufferOffset1, i32 %bufferOffset2, i32 %bufferOffset3, i32 %bufferOffset4) {
+  br label %._crit_edge ; unnamed entry block (%6) branches straight to the loop header
+
+._crit_edge:                                      ; preds = %._crit_edge.._crit_edge_crit_edge, %6
+  %7 = phi float [ 0.000000e+00, %6 ], [ %7, %._crit_edge.._crit_edge_crit_edge ] ; float phi whose only use is itself
+  %vectorized_phi = phi <8 x i32> [ zeroinitializer, %6 ], [ %8, %._crit_edge.._crit_edge_crit_edge ] ; <8 x i32> phi: zero on entry, dpas result %8 on the back edge
+  %8 = call <8 x i32> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x i32> %vectorized_phi, <8 x i16> zeroinitializer, <8 x i32> zeroinitializer, i32 0, i32 0, i32 0, i32 0, i1 false) ; dpas takes the vector phi as its first operand
+  br label %._crit_edge.._crit_edge_crit_edge
+
+._crit_edge.._crit_edge_crit_edge:                ; preds = %._crit_edge
+  br label %._crit_edge ; back edge to the loop header
+}
+
+declare <8 x i32> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x i32>, <8 x i16>, <8 x i32>, i32, i32, i32, i32, i1)
+
+!igc.functions = !{!0}
+!IGCMetadata = !{!4}
+
+!0 = distinct !{void (half addrspace(1)*, half addrspace(1)*, half addrspace(1)*, float, i8 addrspace(1)*, float addrspace(1)*, <8 x i32>, <8 x i32>, i8*, i32, i32, i32, i32, i32)* @_attn_fwd, !1}
+!1 = distinct !{!2, !3}
+!2 = distinct !{!"function_type", i32 0}
+!3 = distinct !{!"sub_group_size", i32 16}
+!4 = distinct !{!"ModuleMD", !5}
+!5 = distinct !{!"FuncMD", !6, !7}
+!6 = distinct !{!"FuncMDMap[0]", void (half addrspace(1)*, half addrspace(1)*, half addrspace(1)*, float, i8 addrspace(1)*, float addrspace(1)*, <8 x i32>, <8 x i32>, i8*, i32, i32, i32, i32, i32)* @_attn_fwd}
+!7 = distinct !{!"FuncMDValue[0]", !8}
+!8 = distinct !{!"resAllocMD", !9}
+!9 = distinct !{!"argAllocMDList", !10}
+!10 = distinct !{!"argAllocMDListVec[0]"}
diff --git a/IGC/Compiler/tests/IGCVectorizer/vectorizer-not-add-i32.ll b/IGC/Compiler/tests/IGCVectorizer/vectorizer-not-add-i32.ll
@@ -0,0 +1,90 @@
+; REQUIRES: regkeys
+; RUN: igc_opt -S  --igc-vectorizer -dce  --regkey=VectorizerLog=1 --regkey=VectorizerLogToErr=1 < %s 2>&1 | FileCheck %s
+
+; CHECK: Start:   %25 = insertelement <8 x i32> zeroinitializer, i32 %17, i64 0
+; CHECK: Operand [1]:  First:   %17 = mul i32 %9, %1
+; CHECK:  Not safe to vectorize
+
+; CHECK: some elements weren't even vectorized
+
+; CHECK: %1 = phi i32 [ 0, %0 ], [ %35, %._crit_edge ]
+; CHECK: %2 = phi i32 [ 1, %0 ], [ %36, %._crit_edge ]
+; CHECK: %3 = phi i32 [ 2, %0 ], [ %37, %._crit_edge ]
+; CHECK: %4 = phi i32 [ 3, %0 ], [ %38, %._crit_edge ]
+; CHECK: %5 = phi i32 [ 4, %0 ], [ %39, %._crit_edge ]
+; CHECK: %6 = phi i32 [ 5, %0 ], [ %40, %._crit_edge ]
+; CHECK: %7 = phi i32 [ 6, %0 ], [ %41, %._crit_edge ]
+; CHECK: %8 = phi i32 [ 7, %0 ], [ %42, %._crit_edge ]
+; CHECK-NOT: %vectorized_phi
+
+
+; ModuleID = 'reduced.ll'
+source_filename = "initial_test.ll"
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-n8:16:32"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: convergent nounwind
+define spir_kernel void @quux() {
+  br label %._crit_edge
+
+._crit_edge:                                      ; preds = %._crit_edge, %0
+  %1 = phi i32 [ 0, %0 ], [ %35, %._crit_edge ] ; eight scalar i32 phis: distinct constants on entry, lanes of %34 on the back edge
+  %2 = phi i32 [ 1, %0 ], [ %36, %._crit_edge ]
+  %3 = phi i32 [ 2, %0 ], [ %37, %._crit_edge ]
+  %4 = phi i32 [ 3, %0 ], [ %38, %._crit_edge ]
+  %5 = phi i32 [ 4, %0 ], [ %39, %._crit_edge ]
+  %6 = phi i32 [ 5, %0 ], [ %40, %._crit_edge ]
+  %7 = phi i32 [ 6, %0 ], [ %41, %._crit_edge ]
+  %8 = phi i32 [ 7, %0 ], [ %42, %._crit_edge ]
+  %9  = add i32 %1, 1 ; integer adds on each phi value
+  %10 = add i32 %2, 2
+  %11 = add i32 %3, 3
+  %12 = add i32 %4, 4
+  %13 = add i32 %5, 5
+  %14 = add i32 %6, 6
+  %15 = add i32 %7, 7
+  %16 = add i32 %8, 8
+  %17 = mul i32 %9, %1 ; integer muls; the test expects the log to report this operand as not safe to vectorize
+  %18 = mul i32 %10, %2
+  %19 = mul i32 %11, %3
+  %20 = mul i32 %12, %4
+  %21 = mul i32 %13, %5
+  %22 = mul i32 %14, %6
+  %23 = mul i32 %15, %7
+  %24 = mul i32 %16, %8
+  %25 = insertelement <8 x i32> zeroinitializer, i32 %17, i64 0 ; insertelement chain packs the eight products into one <8 x i32>
+  %26 = insertelement <8 x i32> %25, i32 %18, i64 1
+  %27 = insertelement <8 x i32> %26, i32 %19, i64 2
+  %28 = insertelement <8 x i32> %27, i32 %20, i64 3
+  %29 = insertelement <8 x i32> %28, i32 %21, i64 4
+  %30 = insertelement <8 x i32> %29, i32 %22, i64 5
+  %31 = insertelement <8 x i32> %30, i32 %23, i64 6
+  %32 = insertelement <8 x i32> %31, i32 %24, i64 7
+  %33 = call <8 x i32> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x i32> %32, <8 x i16> zeroinitializer, <8 x i32> zeroinitializer, i32 0, i32 0, i32 0, i32 0, i1 false)
+  %34 = call <8 x i32> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x i32> %33, <8 x i16> zeroinitializer, <8 x i32> zeroinitializer, i32 0, i32 0, i32 0, i32 0, i1 false)
+  %35 = extractelement <8 x i32> %34, i64 0 ; second dpas result split back into scalars that feed the phis
+  %36 = extractelement <8 x i32> %34, i64 1
+  %37 = extractelement <8 x i32> %34, i64 2
+  %38 = extractelement <8 x i32> %34, i64 3
+  %39 = extractelement <8 x i32> %34, i64 4
+  %40 = extractelement <8 x i32> %34, i64 5
+  %41 = extractelement <8 x i32> %34, i64 6
+  %42 = extractelement <8 x i32> %34, i64 7
+  br label %._crit_edge
+}
+
+; Function Attrs: convergent nounwind readnone willreturn
+declare <8 x i32> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x i32>, <8 x i16>, <8 x i32>, i32, i32, i32, i32, i1) #1
+
+; Function Attrs: nofree nosync nounwind readnone speculatable willreturn
+declare i32 @llvm.exp2.i32(i32) #2
+
+attributes #0 = { convergent nounwind }
+attributes #1 = { convergent nounwind readnone willreturn }
+attributes #2 = { nofree nosync nounwind readnone speculatable willreturn }
+
+!igc.functions = !{!0}
+!0 = !{void ()* @quux, !1}
+!1 = !{!2, !3}
+!2 = !{!"function_type", i32 0}
+!3 = !{!"sub_group_size", i32 16}
diff --git a/IGC/Compiler/tests/IGCVectorizer/vectorizer-test-dpas-phi-i32.ll b/IGC/Compiler/tests/IGCVectorizer/vectorizer-test-dpas-phi-i32.ll
@@ -0,0 +1,90 @@
+; RUN: igc_opt --igc-vectorizer -S -dce < %s 2>&1 | FileCheck %s
+
+define spir_kernel void @quux() {
+; CHECK-LABEL: @quux(
+; CHECK-NEXT:  bb43:
+; CHECK-NEXT:    br label [[BB123:%.*]]
+; CHECK:       bb60:
+; CHECK-NEXT:    br label [[BB88:%.*]]
+; CHECK:       bb88:
+; CHECK-NEXT:    [[VECTORIZED_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[BB60:%.*]] ], [ [[TMP113:%.*]], [[BB88]] ]
+; CHECK-NEXT:    [[TMP112:%.*]] = call <8 x i32> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x i32> [[VECTORIZED_PHI]], <8 x i16> zeroinitializer, <8 x i32> zeroinitializer, i32 0, i32 0, i32 0, i32 0, i1 false)
+; CHECK-NEXT:    [[TMP113]] = call <8 x i32> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x i32> zeroinitializer, <8 x i16> zeroinitializer, <8 x i32> zeroinitializer, i32 0, i32 0, i32 0, i32 0, i1 false)
+; CHECK-NEXT:    br i1 false, label [[BB88]], label [[BB123]]
+; CHECK:       bb123:
+; CHECK-NEXT:    [[VECTORIZED_PHI1:%.*]] = phi <8 x i32> [ zeroinitializer, [[BB43:%.*]] ], [ [[TMP113]], [[BB88]] ]
+; CHECK-NEXT:    [[TMP151:%.*]] = bitcast <8 x i32> [[VECTORIZED_PHI1]] to <8 x i32>
+; CHECK-NEXT:    call void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false, i32 0, <8 x i32> [[TMP151]])
+; CHECK-NEXT:    ret void
+;
+bb43:
+  br label %bb123
+
+bb60:                                             ; No predecessors!
+  br label %bb88
+
+bb88:                                             ; preds = %bb88, %bb60
+  %tmp90 = phi i32 [ 0, %bb60 ], [ %tmp114, %bb88 ] ; eight scalar i32 phis: zero from %bb60, lanes of %tmp113 on the back edge
+  %tmp91 = phi i32 [ 0, %bb60 ], [ %tmp115, %bb88 ]
+  %tmp92 = phi i32 [ 0, %bb60 ], [ %tmp116, %bb88 ]
+  %tmp93 = phi i32 [ 0, %bb60 ], [ %tmp117, %bb88 ]
+  %tmp94 = phi i32 [ 0, %bb60 ], [ %tmp118, %bb88 ]
+  %tmp95 = phi i32 [ 0, %bb60 ], [ %tmp119, %bb88 ]
+  %tmp96 = phi i32 [ 0, %bb60 ], [ %tmp120, %bb88 ]
+  %tmp97 = phi i32 [ 0, %bb60 ], [ %tmp121, %bb88 ]
+  %tmp104 = insertelement <8 x i32> zeroinitializer, i32 %tmp90, i64 0 ; phis packed into <8 x i32> as the first dpas operand
+  %tmp105 = insertelement <8 x i32> %tmp104, i32 %tmp91, i64 1
+  %tmp106 = insertelement <8 x i32> %tmp105, i32 %tmp92, i64 2
+  %tmp107 = insertelement <8 x i32> %tmp106, i32 %tmp93, i64 3
+  %tmp108 = insertelement <8 x i32> %tmp107, i32 %tmp94, i64 4
+  %tmp109 = insertelement <8 x i32> %tmp108, i32 %tmp95, i64 5
+  %tmp110 = insertelement <8 x i32> %tmp109, i32 %tmp96, i64 6
+  %tmp111 = insertelement <8 x i32> %tmp110, i32 %tmp97, i64 7
+  %tmp112 = call <8 x i32> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x i32> %tmp111, <8 x i16> zeroinitializer, <8 x i32> zeroinitializer, i32 0, i32 0, i32 0, i32 0, i1 false)
+  %tmp113 = call <8 x i32> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x i32> zeroinitializer, <8 x i16> zeroinitializer, <8 x i32> zeroinitializer, i32 0, i32 0, i32 0, i32 0, i1 false)
+  %tmp114 = extractelement <8 x i32> %tmp113, i64 0 ; %tmp113 split lane by lane; the scalars feed the phis in %bb88 and %bb123
+  %tmp115 = extractelement <8 x i32> %tmp113, i64 1
+  %tmp116 = extractelement <8 x i32> %tmp113, i64 2
+  %tmp117 = extractelement <8 x i32> %tmp113, i64 3
+  %tmp118 = extractelement <8 x i32> %tmp113, i64 4
+  %tmp119 = extractelement <8 x i32> %tmp113, i64 5
+  %tmp120 = extractelement <8 x i32> %tmp113, i64 6
+  %tmp121 = extractelement <8 x i32> %tmp113, i64 7
+  br i1 false, label %bb88, label %bb123
+
+bb123:                                            ; preds = %bb88, %bb43
+  %tmp133 = phi i32 [ 0, %bb43 ], [ %tmp114, %bb88 ] ; exit-block phis: zero from %bb43, the same extracted lanes from %bb88
+  %tmp134 = phi i32 [ 0, %bb43 ], [ %tmp115, %bb88 ]
+  %tmp135 = phi i32 [ 0, %bb43 ], [ %tmp116, %bb88 ]
+  %tmp136 = phi i32 [ 0, %bb43 ], [ %tmp117, %bb88 ]
+  %tmp137 = phi i32 [ 0, %bb43 ], [ %tmp118, %bb88 ]
+  %tmp138 = phi i32 [ 0, %bb43 ], [ %tmp119, %bb88 ]
+  %tmp139 = phi i32 [ 0, %bb43 ], [ %tmp120, %bb88 ]
+  %tmp140 = phi i32 [ 0, %bb43 ], [ %tmp121, %bb88 ]
+  %tmp143 = insertelement <8 x i32> zeroinitializer, i32 %tmp133, i64 0 ; exit phis packed into <8 x i32> for the block write
+  %tmp144 = insertelement <8 x i32> %tmp143, i32 %tmp134, i64 1
+  %tmp145 = insertelement <8 x i32> %tmp144, i32 %tmp135, i64 2
+  %tmp146 = insertelement <8 x i32> %tmp145, i32 %tmp136, i64 3
+  %tmp147 = insertelement <8 x i32> %tmp146, i32 %tmp137, i64 4
+  %tmp148 = insertelement <8 x i32> %tmp147, i32 %tmp138, i64 5
+  %tmp149 = insertelement <8 x i32> %tmp148, i32 %tmp139, i64 6
+  %tmp150 = insertelement <8 x i32> %tmp149, i32 %tmp140, i64 7
+  %tmp151 = bitcast <8 x i32> %tmp150 to <8 x i32> ; source and destination types are identical
+  call void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false, i32 0, <8 x i32> %tmp151)
+  ret void
+}
+
+declare <8 x i32> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x i32>, <8 x i16>, <8 x i32>, i32, i32, i32, i32, i1)
+
+declare <8 x i16> @llvm.genx.GenISA.LSC2DBlockRead.v8i16(i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i1, i1, i32)
+
+declare <8 x i32> @llvm.genx.GenISA.LSC2DBlockRead.v8i32(i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i1, i1, i32)
+
+declare void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i1, i1, i32, <8 x i32>)
+
+!igc.functions = !{!0}
+!0 = !{void ()* @quux, !1}
+!1 = !{!2, !3}
+!2 = !{!"function_type", i32 0}
+!3 = !{!"sub_group_size", i32 16}
+
diff --git a/IGC/common/igc_flags.h b/IGC/common/igc_flags.h
@@ -963,6 +963,7 @@ DECLARE_IGC_REGKEY(DWORD, VectorizerDepWindowMultiplier, 2,
 DECLARE_IGC_REGKEY(bool, VectorizerCheckScalarizer, false, "Add scalariser after vectorizer to check performance", true)
 DECLARE_IGC_REGKEY(DWORD, VectorizerList, -1, "Vectorize only one seed instruction with the provided number", true)
 DECLARE_IGC_REGKEY(bool, EnableVectorEmitter, true, "Enable Vector Emission for a vectorizer", true)
+DECLARE_IGC_REGKEY(bool, VectorizerAllowI32, true, "Allow I32 versions of instructions inside vectorizer", true)
 DECLARE_IGC_REGKEY(bool, VectorizerAllowFPTRUNC, true, "Allow FPTRUNC instructions inside vectorizer", true)
 DECLARE_IGC_REGKEY(bool, VectorizerAllowFDIV, true, "Allow FDIV instructions inside vectorizer", true)
 DECLARE_IGC_REGKEY(bool, VectorizerAllowFMUL, true, "Allow FMUL instructions inside vectorizer", true)