|
| 1 | +;=========================== begin_copyright_notice ============================ |
| 2 | +; |
| 3 | +; Copyright (C) 2025 Intel Corporation |
| 4 | +; |
| 5 | +; SPDX-License-Identifier: MIT |
| 6 | +; |
| 7 | +;============================ end_copyright_notice ============================= |
| 8 | + |
| 9 | +; REQUIRES: regkeys |
| 10 | +; RUN: igc_opt --opaque-pointers --regkey DisableCodeScheduling=0 --regkey EnableCodeSchedulingIfNoSpills=1 \ |
| 11 | +; RUN: --regkey PrintToConsole=1 --regkey DumpCodeScheduling=1 --igc-code-scheduling \ |
| 12 | +; RUN: --regkey CodeSchedulingRPThreshold=-512 \ |
| 13 | +; RUN: --regkey "CodeSchedulingConfig=10;1;0;30000;100;0;100000;1000;6000;200;10;20;500;50;0;1;1;0;1;1;8;1;32;1;200;64;0;1;20;200;200;5;16;16;34;200;1;0;128;256" \ |
| 14 | +; RUN: --regkey ForceOCLSIMDWidth=32 -S %s 2>&1 | FileCheck %s |
| 15 | + |
| 16 | + |
| 17 | +; Checks that the register pressure is estimated correctly for the special cases related to vector shuffles. |
| 18 | + |
| 19 | +define spir_kernel void @vector_shuffle_no_op(ptr addrspace(1) %A) { |
| 20 | +; CHECK: Function vector_shuffle_no_op |
| 21 | +; CHECK: Greedy MW attempt |
| 22 | +; CHECK: {{([0-9]+,[ ]*[0-9]+[ ]*).*[ ]*}} [[BASE_ADDR:%.*]] = ptrtoint ptr addrspace(1) [[A:%.*]] to i64 |
| 23 | +; Sample dump line for the 2D block load; the second number in the parentheses is the regpressure change verified below: |
| 24 | +; (6, 512 ) MW: Node #1, MW: 3000 %load2d = call <8 x i16> @llvm.genx.GenISA.LSC2DBlockRead.v8i16(i64 %base_addr, i32 127, i32 1023, i32 127, i32 0, i32 0, i32 16, i32 16, i32 16, i32 2, i1 false, i1 false, i32 4) |
| 25 | +; The load adds 512 bytes of regpressure (<8 x i16> is 16 bytes per lane, times the forced SIMD width of 32). |
| 26 | +; CHECK: {{([0-9]+,[ ]*512[ ]*).*[ ]*}} [[LOAD2D:%.*]] = call <8 x i16> @llvm.genx.GenISA.LSC2DBlockRead.v8i16(i64 [[BASE_ADDR]], i32 127, i32 1023, i32 127, i32 0, i32 0, i32 16, i32 16, i32 16, i32 2, i1 false, i1 false, i32 4) |
| 27 | + |
| 28 | +; The extractelement (EE) and insertelement (IE) instructions are marked as NOP and add no regpressure: each group copies four consecutive elements of %load2d (0-3, then 4-7) in order. |
| 29 | +; (22, 0 ) Im: NOP Node #2, MW: 3000 %EE1.0 = extractelement <8 x i16> %load2d, i32 0 |
| 30 | +; CHECK: {{([0-9]+,[ ]*0[ ]*).*NOP.*[ ]*}} [[EE1_0:%.*]] = extractelement <8 x i16> [[LOAD2D]], i32 0 |
| 31 | +; CHECK: {{([0-9]+,[ ]*0[ ]*).*NOP.*[ ]*}} [[IE1_0:%.*]] = insertelement <4 x i16> undef, i16 [[EE1_0]], i32 0 |
| 32 | +; CHECK: {{([0-9]+,[ ]*0[ ]*).*NOP.*[ ]*}} [[EE1_1:%.*]] = extractelement <8 x i16> [[LOAD2D]], i32 1 |
| 33 | +; CHECK: {{([0-9]+,[ ]*0[ ]*).*NOP.*[ ]*}} [[IE1_1:%.*]] = insertelement <4 x i16> [[IE1_0]], i16 [[EE1_1]], i32 1 |
| 34 | +; CHECK: {{([0-9]+,[ ]*0[ ]*).*NOP.*[ ]*}} [[EE1_2:%.*]] = extractelement <8 x i16> [[LOAD2D]], i32 2 |
| 35 | +; CHECK: {{([0-9]+,[ ]*0[ ]*).*NOP.*[ ]*}} [[IE1_2:%.*]] = insertelement <4 x i16> [[IE1_1]], i16 [[EE1_2]], i32 2 |
| 36 | +; CHECK: {{([0-9]+,[ ]*0[ ]*).*NOP.*[ ]*}} [[EE1_3:%.*]] = extractelement <8 x i16> [[LOAD2D]], i32 3 |
| 37 | +; CHECK: {{([0-9]+,[ ]*0[ ]*).*NOP.*[ ]*}} [[IE1_3:%.*]] = insertelement <4 x i16> [[IE1_2]], i16 [[EE1_3]], i32 3 |
| 38 | +; CHECK: {{([0-9]+,[ ]*0[ ]*).*NOP.*[ ]*}} [[EE2_0:%.*]] = extractelement <8 x i16> [[LOAD2D]], i32 4 |
| 39 | +; CHECK: {{([0-9]+,[ ]*0[ ]*).*NOP.*[ ]*}} [[IE2_0:%.*]] = insertelement <4 x i16> undef, i16 [[EE2_0]], i32 0 |
| 40 | +; CHECK: {{([0-9]+,[ ]*0[ ]*).*NOP.*[ ]*}} [[EE2_1:%.*]] = extractelement <8 x i16> [[LOAD2D]], i32 5 |
| 41 | +; CHECK: {{([0-9]+,[ ]*0[ ]*).*NOP.*[ ]*}} [[IE2_1:%.*]] = insertelement <4 x i16> [[IE2_0]], i16 [[EE2_1]], i32 1 |
| 42 | +; CHECK: {{([0-9]+,[ ]*0[ ]*).*NOP.*[ ]*}} [[EE2_2:%.*]] = extractelement <8 x i16> [[LOAD2D]], i32 6 |
| 43 | +; CHECK: {{([0-9]+,[ ]*0[ ]*).*NOP.*[ ]*}} [[IE2_2:%.*]] = insertelement <4 x i16> [[IE2_1]], i16 [[EE2_2]], i32 2 |
| 44 | +; CHECK: {{([0-9]+,[ ]*0[ ]*).*NOP.*[ ]*}} [[EE2_3:%.*]] = extractelement <8 x i16> [[LOAD2D]], i32 7 |
| 45 | +; CHECK: {{([0-9]+,[ ]*0[ ]*).*NOP.*[ ]*}} [[IE2_3:%.*]] = insertelement <4 x i16> [[IE2_2]], i16 [[EE2_3]], i32 3 |
| 46 | + |
| 47 | +; (22, 512 ) MW: Node #34, MW: 0 %dpas1 = call <4 x float> @llvm.genx.GenISA.sub.group.dpas.v4f32.v4f32.v4i16.v4i32(<4 x float> zeroinitializer, <4 x i16> %IE1.3, <4 x i32> zeroinitializer, i32 1, i32 1, i32 1, i32 1, i1 false) |
| 48 | +; (38, 0 ) MW: Node #35, MW: 0 %dpas2 = call <4 x float> @llvm.genx.GenISA.sub.group.dpas.v4f32.v4f32.v4i16.v4i32(<4 x float> zeroinitializer, <4 x i16> %IE2.3, <4 x i32> zeroinitializer, i32 1, i32 1, i32 1, i32 1, i1 false) |
| 49 | +; The first DPAS increases the regpressure by 512 bytes; the second one adds none because the whole vector dies there. (The two sample lines above are illustrative; node numbers in an actual dump may differ.) |
| 50 | +; CHECK: {{([0-9]+,[ ]*512[ ]*).*[ ]*}} [[DPAS1:%.*]] = call <4 x float> @llvm.genx.GenISA.sub.group.dpas.v4f32.v4f32.v4i16.v4i32(<4 x float> zeroinitializer, <4 x i16> [[IE1_3]], <4 x i32> zeroinitializer, i32 1, i32 1, i32 1, i32 1, i1 false) |
| 51 | +; CHECK: {{([0-9]+,[ ]*0[ ]*).*[ ]*}} [[DPAS2:%.*]] = call <4 x float> @llvm.genx.GenISA.sub.group.dpas.v4f32.v4f32.v4i16.v4i32(<4 x float> zeroinitializer, <4 x i16> [[IE2_3]], <4 x i32> zeroinitializer, i32 1, i32 1, i32 1, i32 1, i1 false) |
| 52 | + |
| 53 | +; CHECK: ret void |
| 54 | +; |
| 55 | +entry: |
| 56 | + %base_addr = ptrtoint ptr addrspace(1) %A to i64 |
| 57 | + %load2d = call <8 x i16> @llvm.genx.GenISA.LSC2DBlockRead.v8i16(i64 %base_addr, i32 127, i32 1023, i32 127, i32 0, i32 0, i32 16, i32 16, i32 16, i32 2, i1 false, i1 false, i32 4) |
| 58 | + %EE1.0 = extractelement <8 x i16> %load2d, i32 0 |
| 59 | + %IE1.0 = insertelement <4 x i16> undef, i16 %EE1.0, i32 0 |
| 60 | + %EE1.1 = extractelement <8 x i16> %load2d, i32 1 |
| 61 | + %IE1.1 = insertelement <4 x i16> %IE1.0, i16 %EE1.1, i32 1 |
| 62 | + %EE1.2 = extractelement <8 x i16> %load2d, i32 2 |
| 63 | + %IE1.2 = insertelement <4 x i16> %IE1.1, i16 %EE1.2, i32 2 |
| 64 | + %EE1.3 = extractelement <8 x i16> %load2d, i32 3 |
| 65 | + %IE1.3 = insertelement <4 x i16> %IE1.2, i16 %EE1.3, i32 3 |
| 66 | + |
| 67 | + %EE2.0 = extractelement <8 x i16> %load2d, i32 4 |
| 68 | + %IE2.0 = insertelement <4 x i16> undef, i16 %EE2.0, i32 0 |
| 69 | + %EE2.1 = extractelement <8 x i16> %load2d, i32 5 |
| 70 | + %IE2.1 = insertelement <4 x i16> %IE2.0, i16 %EE2.1, i32 1 |
| 71 | + %EE2.2 = extractelement <8 x i16> %load2d, i32 6 |
| 72 | + %IE2.2 = insertelement <4 x i16> %IE2.1, i16 %EE2.2, i32 2 |
| 73 | + %EE2.3 = extractelement <8 x i16> %load2d, i32 7 |
| 74 | + %IE2.3 = insertelement <4 x i16> %IE2.2, i16 %EE2.3, i32 3 |
| 75 | + |
| 76 | + %dpas1 = call <4 x float> @llvm.genx.GenISA.sub.group.dpas.v4f32.v4f32.v4i16.v4i32(<4 x float> zeroinitializer, <4 x i16> %IE1.3, <4 x i32> zeroinitializer, i32 1, i32 1, i32 1, i32 1, i1 false) |
| 77 | + %dpas2 = call <4 x float> @llvm.genx.GenISA.sub.group.dpas.v4f32.v4f32.v4i16.v4i32(<4 x float> zeroinitializer, <4 x i16> %IE2.3, <4 x i32> zeroinitializer, i32 1, i32 1, i32 1, i32 1, i1 false) |
| 78 | + ret void |
| 79 | +} |
| 80 | + |
| 81 | + |
| 82 | +define spir_kernel void @vector_shuffle(ptr addrspace(1) %A) { |
| 83 | +; CHECK: Function vector_shuffle |
| 84 | +; CHECK: {{([0-9]+,[ ]*[0-9]+[ ]*).*[ ]*}} [[BASE_ADDR:%.*]] = ptrtoint ptr addrspace(1) [[A:%.*]] to i64 |
| 85 | + |
| 86 | +; (6, 512 ) MW: Node #1, MW: 3000 %load2d = call <8 x i16> @llvm.genx.GenISA.LSC2DBlockRead.v8i16(i64 %base_addr, i32 127, i32 1023, i32 127, i32 0, i32 0, i32 16, i32 16, i32 16, i32 2, i1 false, i1 false, i32 4) |
| 87 | +; The load itself adds 512 bytes, but the regpressure burst from the shuffles is also estimated, so 2x (1024 bytes) is used when making the scheduling decision. |
| 88 | +; CHECK: {{([0-9]+,[ ]*1024[ ]*).*[ ]*}} [[LOAD2D:%.*]] = call <8 x i16> @llvm.genx.GenISA.LSC2DBlockRead.v8i16(i64 [[BASE_ADDR]], i32 127, i32 1023, i32 127, i32 0, i32 0, i32 16, i32 16, i32 16, i32 2, i1 false, i1 false, i32 4) |
| 89 | + |
| 90 | + |
| 91 | +; The EE and IE instructions are marked as VS: even source elements (0, 2, 4, 6) feed the first vector and odd ones (1, 3, 5, 7) the second. The first IE of each new vector adds 256 bytes; the last IE kills the original vector (-512). |
| 92 | + |
| 93 | +; CHECK: {{([0-9]+,[ ]*0[ ]*).*VS.*[ ]*}} [[EE1_0:%.*]] = extractelement <8 x i16> [[LOAD2D]], i32 0 |
| 94 | +; CHECK: {{([0-9]+,[ ]*256[ ]*).*VS.*[ ]*}} [[IE1_0:%.*]] = insertelement <4 x i16> undef, i16 [[EE1_0]], i32 0 |
| 95 | + |
| 96 | +; CHECK: {{([0-9]+,[ ]*0[ ]*).*VS.*[ ]*}} [[EE1_1:%.*]] = extractelement <8 x i16> [[LOAD2D]], i32 2 |
| 97 | +; CHECK: {{([0-9]+,[ ]*0[ ]*).*VS.*[ ]*}} [[IE1_1:%.*]] = insertelement <4 x i16> [[IE1_0]], i16 [[EE1_1]], i32 1 |
| 98 | +; CHECK: {{([0-9]+,[ ]*0[ ]*).*VS.*[ ]*}} [[EE1_2:%.*]] = extractelement <8 x i16> [[LOAD2D]], i32 4 |
| 99 | +; CHECK: {{([0-9]+,[ ]*0[ ]*).*VS.*[ ]*}} [[IE1_2:%.*]] = insertelement <4 x i16> [[IE1_1]], i16 [[EE1_2]], i32 2 |
| 100 | +; CHECK: {{([0-9]+,[ ]*0[ ]*).*VS.*[ ]*}} [[EE1_3:%.*]] = extractelement <8 x i16> [[LOAD2D]], i32 6 |
| 101 | +; CHECK: {{([0-9]+,[ ]*0[ ]*).*VS.*[ ]*}} [[IE1_3:%.*]] = insertelement <4 x i16> [[IE1_2]], i16 [[EE1_3]], i32 3 |
| 102 | + |
| 103 | +; CHECK: {{([0-9]+,[ ]*0[ ]*).*VS.*[ ]*}} [[EE2_0:%.*]] = extractelement <8 x i16> [[LOAD2D]], i32 1 |
| 104 | +; CHECK: {{([0-9]+,[ ]*256[ ]*).*VS.*[ ]*}} [[IE2_0:%.*]] = insertelement <4 x i16> undef, i16 [[EE2_0]], i32 0 |
| 105 | + |
| 106 | +; CHECK: {{([0-9]+,[ ]*0[ ]*).*VS.*[ ]*}} [[EE2_1:%.*]] = extractelement <8 x i16> [[LOAD2D]], i32 3 |
| 107 | +; CHECK: {{([0-9]+,[ ]*0[ ]*).*VS.*[ ]*}} [[IE2_1:%.*]] = insertelement <4 x i16> [[IE2_0]], i16 [[EE2_1]], i32 1 |
| 108 | +; CHECK: {{([0-9]+,[ ]*0[ ]*).*VS.*[ ]*}} [[EE2_2:%.*]] = extractelement <8 x i16> [[LOAD2D]], i32 5 |
| 109 | +; CHECK: {{([0-9]+,[ ]*0[ ]*).*VS.*[ ]*}} [[IE2_2:%.*]] = insertelement <4 x i16> [[IE2_1]], i16 [[EE2_2]], i32 2 |
| 110 | +; CHECK: {{([0-9]+,[ ]*0[ ]*).*VS.*[ ]*}} [[EE2_3:%.*]] = extractelement <8 x i16> [[LOAD2D]], i32 7 |
| 111 | + |
| 112 | +; CHECK: {{([0-9]+,[ ]*-512[ ]*).*VS.*[ ]*}} [[IE2_3:%.*]] = insertelement <4 x i16> [[IE2_2]], i16 [[EE2_3]], i32 3 |
| 113 | + |
| 114 | +; Each DPAS increases the regpressure by 256 bytes: its 256-byte sub-vector operand dies and a 512-byte result is created. |
| 115 | +; CHECK: {{([0-9]+,[ ]*256[ ]*).*[ ]*}} [[DPAS1:%.*]] = call <4 x float> @llvm.genx.GenISA.sub.group.dpas.v4f32.v4f32.v4i16.v4i32(<4 x float> zeroinitializer, <4 x i16> [[IE1_3]], <4 x i32> zeroinitializer, i32 1, i32 1, i32 1, i32 1, i1 false) |
| 116 | +; CHECK: {{([0-9]+,[ ]*256[ ]*).*[ ]*}} [[DPAS2:%.*]] = call <4 x float> @llvm.genx.GenISA.sub.group.dpas.v4f32.v4f32.v4i16.v4i32(<4 x float> zeroinitializer, <4 x i16> [[IE2_3]], <4 x i32> zeroinitializer, i32 1, i32 1, i32 1, i32 1, i1 false) |
| 117 | +; CHECK: ret void |
| 118 | +; |
| 119 | +entry: |
| 120 | + %base_addr = ptrtoint ptr addrspace(1) %A to i64 |
| 121 | + %load2d = call <8 x i16> @llvm.genx.GenISA.LSC2DBlockRead.v8i16(i64 %base_addr, i32 127, i32 1023, i32 127, i32 0, i32 0, i32 16, i32 16, i32 16, i32 2, i1 false, i1 false, i32 4) |
| 122 | + %EE1.0 = extractelement <8 x i16> %load2d, i32 0 |
| 123 | + %IE1.0 = insertelement <4 x i16> undef, i16 %EE1.0, i32 0 |
| 124 | + %EE1.1 = extractelement <8 x i16> %load2d, i32 2 |
| 125 | + %IE1.1 = insertelement <4 x i16> %IE1.0, i16 %EE1.1, i32 1 |
| 126 | + %EE1.2 = extractelement <8 x i16> %load2d, i32 4 |
| 127 | + %IE1.2 = insertelement <4 x i16> %IE1.1, i16 %EE1.2, i32 2 |
| 128 | + %EE1.3 = extractelement <8 x i16> %load2d, i32 6 |
| 129 | + %IE1.3 = insertelement <4 x i16> %IE1.2, i16 %EE1.3, i32 3 |
| 130 | + %EE2.0 = extractelement <8 x i16> %load2d, i32 1 |
| 131 | + %IE2.0 = insertelement <4 x i16> undef, i16 %EE2.0, i32 0 |
| 132 | + %EE2.1 = extractelement <8 x i16> %load2d, i32 3 |
| 133 | + %IE2.1 = insertelement <4 x i16> %IE2.0, i16 %EE2.1, i32 1 |
| 134 | + %EE2.2 = extractelement <8 x i16> %load2d, i32 5 |
| 135 | + %IE2.2 = insertelement <4 x i16> %IE2.1, i16 %EE2.2, i32 2 |
| 136 | + %EE2.3 = extractelement <8 x i16> %load2d, i32 7 |
| 137 | + %IE2.3 = insertelement <4 x i16> %IE2.2, i16 %EE2.3, i32 3 |
| 138 | + %dpas1 = call <4 x float> @llvm.genx.GenISA.sub.group.dpas.v4f32.v4f32.v4i16.v4i32(<4 x float> zeroinitializer, <4 x i16> %IE1.3, <4 x i32> zeroinitializer, i32 1, i32 1, i32 1, i32 1, i1 false) |
| 139 | + %dpas2 = call <4 x float> @llvm.genx.GenISA.sub.group.dpas.v4f32.v4f32.v4i16.v4i32(<4 x float> zeroinitializer, <4 x i16> %IE2.3, <4 x i32> zeroinitializer, i32 1, i32 1, i32 1, i32 1, i1 false) |
| 140 | + ret void |
| 141 | +} |
| 142 | + |
| 143 | + |
| 144 | +define spir_kernel void @coalesced_scalars(ptr addrspace(1) %0) { |
| 145 | +; CHECK: Function coalesced_scalars |
| 146 | + |
| 147 | +; The IE instructions are marked as SCA. The first IE adds the regpressure of the whole vector (512 bytes); |
| 148 | +; later, the last use of a coalesced scalar (the fadd reading the fmul result) kills the hanging values. |
| 149 | + |
| 150 | +; CHECK: {{([0-9]+,[ ]*[0-9-]+[ ]*).*[ ]*}} [[TMP17:%.*]] = fmul fast float [[TMP9:%.*]], [[TMP1:%.*]] |
| 151 | +; CHECK: {{([0-9]+,[ ]*512[ ]*).*SCA.*[ ]*}} [[TMP19:%.*]] = insertelement <4 x float> zeroinitializer, float [[TMP17]], i64 0 |
| 152 | +; CHECK: {{([0-9]+,[ ]*[0-9-]+[ ]*).*[ ]*}} [[TMP18:%.*]] = fmul fast float [[TMP10:%.*]], [[TMP2:%.*]] |
| 153 | +; CHECK: {{([0-9]+,[ ]*[0-9-]+[ ]*).*SCA.*[ ]*}} [[TMP21:%.*]] = insertelement <4 x float> [[TMP19]], float [[TMP18]], i64 1 |
| 154 | +; CHECK: {{([0-9]+,[ ]*[0-9-]+[ ]*).*[ ]*}} [[TMP20:%.*]] = fmul fast float [[TMP11:%.*]], [[TMP3:%.*]] |
| 155 | +; CHECK: {{([0-9]+,[ ]*0[ ]*).*SCA.*[ ]*}} [[TMP23:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP20]], i64 2 |
| 156 | +; CHECK: {{([0-9]+,[ ]*[0-9-]+[ ]*).*[ ]*}} [[TMP22:%.*]] = fmul fast float [[TMP12:%.*]], [[TMP4:%.*]] |
| 157 | +; CHECK: {{([0-9]+,[ ]*0[ ]*).*SCA.*[ ]*}} [[TMP25:%.*]] = insertelement <4 x float> [[TMP23]], float [[TMP22]], i64 3 |
| 158 | + |
| 159 | +; The DPAS calls don't add any regpressure; they reuse the registers of the created vector. |
| 160 | +; CHECK: {{([0-9]+,[ ]*0[ ]*).*[ ]*}} [[TMP33:%.*]] = call <4 x float> @llvm.genx.GenISA.sub.group.dpas.v4f32.v4f32.v4i16.v4i32(<4 x float> [[TMP25]], <4 x i16> zeroinitializer, <4 x i32> zeroinitializer, i32 0, i32 0, i32 0, i32 0, i1 false) |
| 161 | +; CHECK: {{([0-9]+,[ ]*0[ ]*).*[ ]*}} [[TMP34:%.*]] = call <4 x float> @llvm.genx.GenISA.sub.group.dpas.v4f32.v4f32.v4i16.v4i32(<4 x float> [[TMP33]], <4 x i16> zeroinitializer, <4 x i32> zeroinitializer, i32 0, i32 0, i32 0, i32 0, i1 false) |
| 162 | + |
| 163 | +; Extract the values from the vector. The extractelement instructions are marked as V2S and don't increase regpressure. |
| 164 | +; The vector hangs, so the last EE doesn't reduce regpressure. Here the EEs are used in PHI nodes, so these hanging values are not killed. |
| 165 | +; CHECK: {{([0-9]+,[ ]*0[ ]*).*V2S.*[ ]*}} [[TMP35:%.*]] = extractelement <4 x float> [[TMP34]], i64 0 |
| 166 | +; CHECK: {{([0-9]+,[ ]*0[ ]*).*V2S.*[ ]*}} [[TMP36:%.*]] = extractelement <4 x float> [[TMP34]], i64 1 |
| 167 | +; CHECK: {{([0-9]+,[ ]*0[ ]*).*V2S.*[ ]*}} [[TMP37:%.*]] = extractelement <4 x float> [[TMP34]], i64 2 |
| 168 | +; CHECK: {{([0-9]+,[ ]*0[ ]*).*V2S.*[ ]*}} [[TMP38:%.*]] = extractelement <4 x float> [[TMP34]], i64 3 |
| 169 | + |
| 170 | +; This fadd (reading the EE captured above and the last fmul) kills the hanging values from SCA and creates one float -> -384 (3 * 32 * 4). |
| 171 | +; CHECK: {{([0-9]+,[ ]*-384[ ]*).*[ ]*}} [[TMP50:%.*]] = fadd fast float [[TMP37]], [[TMP22]] |
| 172 | +; This fadd (reading the last EE captured above) doesn't kill the hanging EEs (they are used in PHI nodes), so it increases regpressure by 128 (32 * 4). |
| 173 | +; CHECK: {{([0-9]+,[ ]*128[ ]*).*[ ]*}} [[TMP51:%.*]] = fadd fast float [[TMP38]], 4.000000e+00 |
| 174 | + |
| 175 | + entry: |
| 176 | + br label %._crit_edge |
| 177 | + |
| 178 | + ._crit_edge: ; preds = %._crit_edge, %entry |
| 179 | + %1 = phi float [ 0.000000e+00, %entry ], [ %19, %._crit_edge ] |
| 180 | + %2 = phi float [ 0.000000e+00, %entry ], [ %20, %._crit_edge ] |
| 181 | + %3 = phi float [ 0.000000e+00, %entry ], [ %21, %._crit_edge ] |
| 182 | + %4 = phi float [ 0.000000e+00, %entry ], [ %22, %._crit_edge ] |
| 183 | + %5 = call float @llvm.exp2.f32(float 0.000000e+00) |
| 184 | + %6 = call float @llvm.exp2.f32(float 0.000000e+00) |
| 185 | + %7 = call float @llvm.exp2.f32(float 0.000000e+00) |
| 186 | + %8 = call float @llvm.exp2.f32(float 0.000000e+00) |
| 187 | + %9 = fmul fast float %5, %1 |
| 188 | + %10 = fmul fast float %6, %2 |
| 189 | + %11 = fmul fast float %7, %3 |
| 190 | + %12 = fmul fast float %8, %4 |
| 191 | + %13 = insertelement <4 x float> zeroinitializer, float %9, i64 0 |
| 192 | + %14 = insertelement <4 x float> %13, float %10, i64 1 |
| 193 | + %15 = insertelement <4 x float> %14, float %11, i64 2 |
| 194 | + %16 = insertelement <4 x float> %15, float %12, i64 3 |
| 195 | + %17 = call <4 x float> @llvm.genx.GenISA.sub.group.dpas.v4f32.v4f32.v4i16.v4i32(<4 x float> %16, <4 x i16> zeroinitializer, <4 x i32> zeroinitializer, i32 0, i32 0, i32 0, i32 0, i1 false) |
| 196 | + %18 = call <4 x float> @llvm.genx.GenISA.sub.group.dpas.v4f32.v4f32.v4i16.v4i32(<4 x float> %17, <4 x i16> zeroinitializer, <4 x i32> zeroinitializer, i32 0, i32 0, i32 0, i32 0, i1 false) |
| 197 | + %19 = extractelement <4 x float> %18, i64 0 |
| 198 | + %20 = extractelement <4 x float> %18, i64 1 |
| 199 | + %21 = extractelement <4 x float> %18, i64 2 |
| 200 | + %22 = extractelement <4 x float> %18, i64 3 |
| 201 | + %23 = fadd fast float %21, %12 |
| 202 | + %24 = fadd fast float %22, 4.000000e+00 |
| 203 | + |
| 204 | + br label %._crit_edge |
| 205 | +} |
| 206 | + |
| 207 | + |
| 208 | +define spir_kernel void @vector_to_scalars_pattern(ptr addrspace(1) %A) { |
| 209 | +; CHECK: Function vector_to_scalars_pattern |
| 210 | + |
| 211 | +; The DPAS result increases regpressure by 512 bytes. |
| 212 | +; CHECK: {{([0-9]+,[ ]*512[ ]*).*[ ]*}} [[DPAS:%.*]] = call <4 x float> @llvm.genx.GenISA.sub.group.dpas.v4f32.v4f32.v4i16.v4i32(<4 x float> zeroinitializer, <4 x i16> undef, <4 x i32> zeroinitializer, i32 1, i32 1, i32 1, i32 1, i1 false) |
| 213 | + |
| 214 | +; The EEs are marked as V2S and don't increase regpressure. |
| 215 | +; CHECK: {{([0-9]+,[ ]*0[ ]*).*V2S.*[ ]*}} [[EE0:%.*]] = extractelement <4 x float> [[DPAS]], i64 0 |
| 216 | +; CHECK: {{([0-9]+,[ ]*0[ ]*).*V2S.*[ ]*}} [[EE1:%.*]] = extractelement <4 x float> [[DPAS]], i64 1 |
| 217 | +; CHECK: {{([0-9]+,[ ]*0[ ]*).*V2S.*[ ]*}} [[EE2:%.*]] = extractelement <4 x float> [[DPAS]], i64 2 |
| 218 | +; CHECK: {{([0-9]+,[ ]*0[ ]*).*V2S.*[ ]*}} [[EE3:%.*]] = extractelement <4 x float> [[DPAS]], i64 3 |
| 219 | + |
| 220 | +; The vector doesn't die on the last EE; it hangs. |
| 221 | +; Each of the first three uses of an EE creates a new float and increases regpressure by 128 bytes. |
| 222 | +; CHECK: {{([0-9]+,[ ]*128[ ]*).*[ ]*}} [[USE0:%.*]] = fadd fast float [[EE0]], 1.0 |
| 223 | +; CHECK: {{([0-9]+,[ ]*128[ ]*).*[ ]*}} [[USE1:%.*]] = fadd fast float [[EE1]], 2.0 |
| 224 | +; CHECK: {{([0-9]+,[ ]*128[ ]*).*[ ]*}} [[USE2:%.*]] = fadd fast float [[EE2]], 3.0 |
| 225 | +; The vector dies on the last EE usage: -512 for the vector plus 128 for the new float gives -384. |
| 226 | +; CHECK: {{([0-9]+,[ ]*-384[ ]*).*[ ]*}} [[USE3:%.*]] = fadd fast float [[EE3]], 4.0 |
| 227 | + |
| 228 | +; CHECK: ret void |
| 229 | + |
| 230 | +entry: |
| 231 | + %dpas = call <4 x float> @llvm.genx.GenISA.sub.group.dpas.v4f32.v4f32.v4i16.v4i32(<4 x float> zeroinitializer, <4 x i16> undef, <4 x i32> zeroinitializer, i32 1, i32 1, i32 1, i32 1, i1 false) |
| 232 | + %ee0 = extractelement <4 x float> %dpas, i64 0 |
| 233 | + %ee1 = extractelement <4 x float> %dpas, i64 1 |
| 234 | + %ee2 = extractelement <4 x float> %dpas, i64 2 |
| 235 | + %ee3 = extractelement <4 x float> %dpas, i64 3 |
| 236 | + %use0 = fadd fast float %ee0, 1.0 |
| 237 | + %use1 = fadd fast float %ee1, 2.0 |
| 238 | + %use2 = fadd fast float %ee2, 3.0 |
| 239 | + %use3 = fadd fast float %ee3, 4.0 |
| 240 | + ret void |
| 241 | +} |
| 242 | + |
| 243 | + |
| 244 | + |
| 245 | +declare <4 x float> @llvm.genx.GenISA.sub.group.dpas.v4f32.v4f32.v4i16.v4i32( |
| 246 | + <4 x float>, <4 x i16>, <4 x i32>, i32, i32, i32, i32, i1) #1 ; DPAS intrinsic called by all four kernels above |
| 247 | + |
| 248 | +declare float @llvm.exp2.f32(float) #2 ; called only in @coalesced_scalars |
| 249 | + |
| 250 | +declare <8 x i16> @llvm.genx.GenISA.LSC2DBlockRead.v8i16(i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i1, i1, i32) #1 ; 2D block load called by the two vector_shuffle kernels |
| 251 | +attributes #0 = { convergent nounwind } ; NOTE(review): #0 is not referenced in the visible part of this file - confirm whether it is needed |
| 252 | +attributes #1 = { convergent nounwind readnone willreturn } |
| 253 | +attributes #2 = { nofree nosync nounwind readnone speculatable willreturn } |
0 commit comments