Skip to content

Commit f7a18fd

Browse files
pkwasnie-intel authored and igcbot committed
add Code Scheduling LIT for SIMD32
Add new LIT for Code Scheduling to test SIMD32 kernels.
1 parent 7682d93 commit f7a18fd

File tree

2 files changed

+256
-1
lines changed

2 files changed

+256
-1
lines changed

IGC/Compiler/tests/CodeScheduling/reg-est-vector-cases.ll renamed to IGC/Compiler/tests/CodeScheduling/reg-est-vector-cases-simd16.ll

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,9 @@
99
; REQUIRES: regkeys
1010
; RUN: igc_opt --opaque-pointers --regkey DisableCodeScheduling=0 --regkey EnableCodeSchedulingIfNoSpills=1 \
1111
; RUN: --regkey PrintToConsole=1 --regkey DumpCodeScheduling=1 --igc-code-scheduling \
12-
; RUN: --regkey CodeSchedulingRPThreshold=-512 -S %s 2>&1 | FileCheck %s
12+
; RUN: --regkey CodeSchedulingRPThreshold=-512 \
13+
; RUN: --regkey "CodeSchedulingConfig=10;1;0;30000;100;0;100000;1000;6000;200;10;20;500;50;0;1;1;0;1;1;8;1;32;1;200;64;0;1;20;200;200;5;16;16;34;200;1;0;128;256" \
14+
; RUN: --regkey ForceOCLSIMDWidth=16 -S %s 2>&1 | FileCheck %s
1315

1416

1517
; Checks that the register pressure is estimated correctly for the special cases related to vector shuffles.
Lines changed: 253 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,253 @@
1+
;=========================== begin_copyright_notice ============================
2+
;
3+
; Copyright (C) 2025 Intel Corporation
4+
;
5+
; SPDX-License-Identifier: MIT
6+
;
7+
;============================ end_copyright_notice =============================
8+
9+
; REQUIRES: regkeys
10+
; RUN: igc_opt --opaque-pointers --regkey DisableCodeScheduling=0 --regkey EnableCodeSchedulingIfNoSpills=1 \
11+
; RUN: --regkey PrintToConsole=1 --regkey DumpCodeScheduling=1 --igc-code-scheduling \
12+
; RUN: --regkey CodeSchedulingRPThreshold=-512 \
13+
; RUN: --regkey "CodeSchedulingConfig=10;1;0;30000;100;0;100000;1000;6000;200;10;20;500;50;0;1;1;0;1;1;8;1;32;1;200;64;0;1;20;200;200;5;16;16;34;200;1;0;128;256" \
14+
; RUN: --regkey ForceOCLSIMDWidth=32 -S %s 2>&1 | FileCheck %s
15+
16+
17+
; Checks that the register pressure is estimated correctly for the special cases related to vector shuffles.
18+
19+
; Test kernel: the <8 x i16> result of the 2D block read is split in element order
; (elements 0-3, then elements 4-7) into two <4 x i16> vectors, and each of them is
; consumed by one dpas call. The expectations below require every extractelement and
; insertelement of that split to be tagged NOP with an estimate of 0.
define spir_kernel void @vector_shuffle_no_op(ptr addrspace(1) %A) {
20+
; CHECK: Function vector_shuffle_no_op
21+
; CHECK: Greedy MW attempt
22+
; CHECK: {{([0-9]+,[ ]*[0-9]+[ ]*).*[ ]*}} [[BASE_ADDR:%.*]] = ptrtoint ptr addrspace(1) [[A:%.*]] to i64
23+
24+
; sample dump line for the load:
; (6, 512 ) MW: Node #1, MW: 3000 %load2d = call <8 x i16> @llvm.genx.GenISA.LSC2DBlockRead.v8i16(i64 %base_addr, i32 127, i32 1023, i32 127, i32 0, i32 0, i32 16, i32 16, i32 16, i32 2, i1 false, i1 false, i32 4)
25+
; adds 512 bytes
26+
; CHECK: {{([0-9]+,[ ]*512[ ]*).*[ ]*}} [[LOAD2D:%.*]] = call <8 x i16> @llvm.genx.GenISA.LSC2DBlockRead.v8i16(i64 [[BASE_ADDR]], i32 127, i32 1023, i32 127, i32 0, i32 0, i32 16, i32 16, i32 16, i32 2, i1 false, i1 false, i32 4)
27+
28+
; the EE and IE instructions are marked as NOP and don't add regpressure
29+
; sample dump line for the first extractelement:
; (22, 0 ) Im: NOP Node #2, MW: 3000 %EE1.0 = extractelement <8 x i16> %load2d, i32 0
30+
; CHECK: {{([0-9]+,[ ]*0[ ]*).*NOP.*[ ]*}} [[EE1_0:%.*]] = extractelement <8 x i16> [[LOAD2D]], i32 0
31+
; CHECK: {{([0-9]+,[ ]*0[ ]*).*NOP.*[ ]*}} [[IE1_0:%.*]] = insertelement <4 x i16> undef, i16 [[EE1_0]], i32 0
32+
; CHECK: {{([0-9]+,[ ]*0[ ]*).*NOP.*[ ]*}} [[EE1_1:%.*]] = extractelement <8 x i16> [[LOAD2D]], i32 1
33+
; CHECK: {{([0-9]+,[ ]*0[ ]*).*NOP.*[ ]*}} [[IE1_1:%.*]] = insertelement <4 x i16> [[IE1_0]], i16 [[EE1_1]], i32 1
34+
; CHECK: {{([0-9]+,[ ]*0[ ]*).*NOP.*[ ]*}} [[EE1_2:%.*]] = extractelement <8 x i16> [[LOAD2D]], i32 2
35+
; CHECK: {{([0-9]+,[ ]*0[ ]*).*NOP.*[ ]*}} [[IE1_2:%.*]] = insertelement <4 x i16> [[IE1_1]], i16 [[EE1_2]], i32 2
36+
; CHECK: {{([0-9]+,[ ]*0[ ]*).*NOP.*[ ]*}} [[EE1_3:%.*]] = extractelement <8 x i16> [[LOAD2D]], i32 3
37+
; CHECK: {{([0-9]+,[ ]*0[ ]*).*NOP.*[ ]*}} [[IE1_3:%.*]] = insertelement <4 x i16> [[IE1_2]], i16 [[EE1_3]], i32 3
38+
; CHECK: {{([0-9]+,[ ]*0[ ]*).*NOP.*[ ]*}} [[EE2_0:%.*]] = extractelement <8 x i16> [[LOAD2D]], i32 4
39+
; CHECK: {{([0-9]+,[ ]*0[ ]*).*NOP.*[ ]*}} [[IE2_0:%.*]] = insertelement <4 x i16> undef, i16 [[EE2_0]], i32 0
40+
; CHECK: {{([0-9]+,[ ]*0[ ]*).*NOP.*[ ]*}} [[EE2_1:%.*]] = extractelement <8 x i16> [[LOAD2D]], i32 5
41+
; CHECK: {{([0-9]+,[ ]*0[ ]*).*NOP.*[ ]*}} [[IE2_1:%.*]] = insertelement <4 x i16> [[IE2_0]], i16 [[EE2_1]], i32 1
42+
; CHECK: {{([0-9]+,[ ]*0[ ]*).*NOP.*[ ]*}} [[EE2_2:%.*]] = extractelement <8 x i16> [[LOAD2D]], i32 6
43+
; CHECK: {{([0-9]+,[ ]*0[ ]*).*NOP.*[ ]*}} [[IE2_2:%.*]] = insertelement <4 x i16> [[IE2_1]], i16 [[EE2_2]], i32 2
44+
; CHECK: {{([0-9]+,[ ]*0[ ]*).*NOP.*[ ]*}} [[EE2_3:%.*]] = extractelement <8 x i16> [[LOAD2D]], i32 7
45+
; CHECK: {{([0-9]+,[ ]*0[ ]*).*NOP.*[ ]*}} [[IE2_3:%.*]] = insertelement <4 x i16> [[IE2_2]], i16 [[EE2_3]], i32 3
46+
47+
; sample dump lines for the two dpas calls; the operands are the last insertelement of each chain (%IE1.3 / %IE2.3)
; NOTE(review): the node ids below look copied from a variant with longer insertelement chains - confirm against a real dump
; (22, 512 ) MW: Node #34, MW: 0 %dpas1 = call <4 x float> @llvm.genx.GenISA.sub.group.dpas.v4f32.v4f32.v4i16.v4i32(<4 x float> zeroinitializer, <4 x i16> %IE1.3, <4 x i32> zeroinitializer, i32 1, i32 1, i32 1, i32 1, i1 false)
48+
; (38, 0 ) MW: Node #35, MW: 0 %dpas2 = call <4 x float> @llvm.genx.GenISA.sub.group.dpas.v4f32.v4f32.v4i16.v4i32(<4 x float> zeroinitializer, <4 x i16> %IE2.3, <4 x i32> zeroinitializer, i32 1, i32 1, i32 1, i32 1, i1 false)
49+
; first DPAS increases the regpressure by 512 bytes, the second one doesn't add any regpressure because the whole vector dies
50+
; CHECK: {{([0-9]+,[ ]*512[ ]*).*[ ]*}} [[DPAS1:%.*]] = call <4 x float> @llvm.genx.GenISA.sub.group.dpas.v4f32.v4f32.v4i16.v4i32(<4 x float> zeroinitializer, <4 x i16> [[IE1_3]], <4 x i32> zeroinitializer, i32 1, i32 1, i32 1, i32 1, i1 false)
51+
; CHECK: {{([0-9]+,[ ]*0[ ]*).*[ ]*}} [[DPAS2:%.*]] = call <4 x float> @llvm.genx.GenISA.sub.group.dpas.v4f32.v4f32.v4i16.v4i32(<4 x float> zeroinitializer, <4 x i16> [[IE2_3]], <4 x i32> zeroinitializer, i32 1, i32 1, i32 1, i32 1, i1 false)
52+
53+
; CHECK: ret void
54+
;
55+
entry:
56+
%base_addr = ptrtoint ptr addrspace(1) %A to i64
57+
%load2d = call <8 x i16> @llvm.genx.GenISA.LSC2DBlockRead.v8i16(i64 %base_addr, i32 127, i32 1023, i32 127, i32 0, i32 0, i32 16, i32 16, i32 16, i32 2, i1 false, i1 false, i32 4)
58+
; first half: elements 0..3 of %load2d, inserted at the same positions 0..3
%EE1.0 = extractelement <8 x i16> %load2d, i32 0
59+
%IE1.0 = insertelement <4 x i16> undef, i16 %EE1.0, i32 0
60+
%EE1.1 = extractelement <8 x i16> %load2d, i32 1
61+
%IE1.1 = insertelement <4 x i16> %IE1.0, i16 %EE1.1, i32 1
62+
%EE1.2 = extractelement <8 x i16> %load2d, i32 2
63+
%IE1.2 = insertelement <4 x i16> %IE1.1, i16 %EE1.2, i32 2
64+
%EE1.3 = extractelement <8 x i16> %load2d, i32 3
65+
%IE1.3 = insertelement <4 x i16> %IE1.2, i16 %EE1.3, i32 3
66+
67+
; second half: elements 4..7 of %load2d, inserted at positions 0..3
%EE2.0 = extractelement <8 x i16> %load2d, i32 4
68+
%IE2.0 = insertelement <4 x i16> undef, i16 %EE2.0, i32 0
69+
%EE2.1 = extractelement <8 x i16> %load2d, i32 5
70+
%IE2.1 = insertelement <4 x i16> %IE2.0, i16 %EE2.1, i32 1
71+
%EE2.2 = extractelement <8 x i16> %load2d, i32 6
72+
%IE2.2 = insertelement <4 x i16> %IE2.1, i16 %EE2.2, i32 2
73+
%EE2.3 = extractelement <8 x i16> %load2d, i32 7
74+
%IE2.3 = insertelement <4 x i16> %IE2.2, i16 %EE2.3, i32 3
75+
76+
; each rebuilt half is the only non-constant operand of its dpas call
%dpas1 = call <4 x float> @llvm.genx.GenISA.sub.group.dpas.v4f32.v4f32.v4i16.v4i32(<4 x float> zeroinitializer, <4 x i16> %IE1.3, <4 x i32> zeroinitializer, i32 1, i32 1, i32 1, i32 1, i1 false)
77+
%dpas2 = call <4 x float> @llvm.genx.GenISA.sub.group.dpas.v4f32.v4f32.v4i16.v4i32(<4 x float> zeroinitializer, <4 x i16> %IE2.3, <4 x i32> zeroinitializer, i32 1, i32 1, i32 1, i32 1, i1 false)
78+
ret void
79+
}
80+
81+
82+
; Test kernel: the <8 x i16> result of the 2D block read is split into two <4 x i16>
; vectors using interleaved elements (even elements 0,2,4,6, then odd elements 1,3,5,7),
; and each of them is consumed by one dpas call. The expectations below require the
; extractelement/insertelement instructions to be tagged VS.
define spir_kernel void @vector_shuffle(ptr addrspace(1) %A) {
83+
; CHECK: Function vector_shuffle
84+
; CHECK: {{([0-9]+,[ ]*[0-9]+[ ]*).*[ ]*}} [[BASE_ADDR:%.*]] = ptrtoint ptr addrspace(1) [[A:%.*]] to i64
85+
86+
; sample dump line for the load:
; (6, 512 ) MW: Node #1, MW: 3000 %load2d = call <8 x i16> @llvm.genx.GenISA.LSC2DBlockRead.v8i16(i64 %base_addr, i32 127, i32 1023, i32 127, i32 0, i32 0, i32 16, i32 16, i32 16, i32 2, i1 false, i1 false, i32 4)
87+
; adds 512 bytes, but we also estimate the regpressure burst from the shuffles, so use 2x (1024 bytes) when making the decision
88+
; CHECK: {{([0-9]+,[ ]*1024[ ]*).*[ ]*}} [[LOAD2D:%.*]] = call <8 x i16> @llvm.genx.GenISA.LSC2DBlockRead.v8i16(i64 [[BASE_ADDR]], i32 127, i32 1023, i32 127, i32 0, i32 0, i32 16, i32 16, i32 16, i32 2, i1 false, i1 false, i32 4)
89+
90+
91+
; the EE and IE instructions are marked as VS. IEs add regpressure. The last IE kills the original vector
92+
93+
; CHECK: {{([0-9]+,[ ]*0[ ]*).*VS.*[ ]*}} [[EE1_0:%.*]] = extractelement <8 x i16> [[LOAD2D]], i32 0
94+
; CHECK: {{([0-9]+,[ ]*256[ ]*).*VS.*[ ]*}} [[IE1_0:%.*]] = insertelement <4 x i16> undef, i16 [[EE1_0]], i32 0
95+
96+
; CHECK: {{([0-9]+,[ ]*0[ ]*).*VS.*[ ]*}} [[EE1_1:%.*]] = extractelement <8 x i16> [[LOAD2D]], i32 2
97+
; CHECK: {{([0-9]+,[ ]*0[ ]*).*VS.*[ ]*}} [[IE1_1:%.*]] = insertelement <4 x i16> [[IE1_0]], i16 [[EE1_1]], i32 1
98+
; CHECK: {{([0-9]+,[ ]*0[ ]*).*VS.*[ ]*}} [[EE1_2:%.*]] = extractelement <8 x i16> [[LOAD2D]], i32 4
99+
; CHECK: {{([0-9]+,[ ]*0[ ]*).*VS.*[ ]*}} [[IE1_2:%.*]] = insertelement <4 x i16> [[IE1_1]], i16 [[EE1_2]], i32 2
100+
; CHECK: {{([0-9]+,[ ]*0[ ]*).*VS.*[ ]*}} [[EE1_3:%.*]] = extractelement <8 x i16> [[LOAD2D]], i32 6
101+
; CHECK: {{([0-9]+,[ ]*0[ ]*).*VS.*[ ]*}} [[IE1_3:%.*]] = insertelement <4 x i16> [[IE1_2]], i16 [[EE1_3]], i32 3
102+
103+
; CHECK: {{([0-9]+,[ ]*0[ ]*).*VS.*[ ]*}} [[EE2_0:%.*]] = extractelement <8 x i16> [[LOAD2D]], i32 1
104+
; CHECK: {{([0-9]+,[ ]*256[ ]*).*VS.*[ ]*}} [[IE2_0:%.*]] = insertelement <4 x i16> undef, i16 [[EE2_0]], i32 0
105+
106+
; CHECK: {{([0-9]+,[ ]*0[ ]*).*VS.*[ ]*}} [[EE2_1:%.*]] = extractelement <8 x i16> [[LOAD2D]], i32 3
107+
; CHECK: {{([0-9]+,[ ]*0[ ]*).*VS.*[ ]*}} [[IE2_1:%.*]] = insertelement <4 x i16> [[IE2_0]], i16 [[EE2_1]], i32 1
108+
; CHECK: {{([0-9]+,[ ]*0[ ]*).*VS.*[ ]*}} [[EE2_2:%.*]] = extractelement <8 x i16> [[LOAD2D]], i32 5
109+
; CHECK: {{([0-9]+,[ ]*0[ ]*).*VS.*[ ]*}} [[IE2_2:%.*]] = insertelement <4 x i16> [[IE2_1]], i16 [[EE2_2]], i32 2
110+
; CHECK: {{([0-9]+,[ ]*0[ ]*).*VS.*[ ]*}} [[EE2_3:%.*]] = extractelement <8 x i16> [[LOAD2D]], i32 7
111+
112+
; CHECK: {{([0-9]+,[ ]*-512[ ]*).*VS.*[ ]*}} [[IE2_3:%.*]] = insertelement <4 x i16> [[IE2_2]], i16 [[EE2_3]], i32 3
113+
114+
; both DPAS increase the regpressure by 256 (sub vector of 256 dies, 512 created)
115+
; CHECK: {{([0-9]+,[ ]*256[ ]*).*[ ]*}} [[DPAS1:%.*]] = call <4 x float> @llvm.genx.GenISA.sub.group.dpas.v4f32.v4f32.v4i16.v4i32(<4 x float> zeroinitializer, <4 x i16> [[IE1_3]], <4 x i32> zeroinitializer, i32 1, i32 1, i32 1, i32 1, i1 false)
116+
; CHECK: {{([0-9]+,[ ]*256[ ]*).*[ ]*}} [[DPAS2:%.*]] = call <4 x float> @llvm.genx.GenISA.sub.group.dpas.v4f32.v4f32.v4i16.v4i32(<4 x float> zeroinitializer, <4 x i16> [[IE2_3]], <4 x i32> zeroinitializer, i32 1, i32 1, i32 1, i32 1, i1 false)
117+
; CHECK: ret void
118+
;
119+
entry:
120+
%base_addr = ptrtoint ptr addrspace(1) %A to i64
121+
%load2d = call <8 x i16> @llvm.genx.GenISA.LSC2DBlockRead.v8i16(i64 %base_addr, i32 127, i32 1023, i32 127, i32 0, i32 0, i32 16, i32 16, i32 16, i32 2, i1 false, i1 false, i32 4)
122+
; first vector: even elements 0,2,4,6 of %load2d go to positions 0..3
%EE1.0 = extractelement <8 x i16> %load2d, i32 0
123+
%IE1.0 = insertelement <4 x i16> undef, i16 %EE1.0, i32 0
124+
%EE1.1 = extractelement <8 x i16> %load2d, i32 2
125+
%IE1.1 = insertelement <4 x i16> %IE1.0, i16 %EE1.1, i32 1
126+
%EE1.2 = extractelement <8 x i16> %load2d, i32 4
127+
%IE1.2 = insertelement <4 x i16> %IE1.1, i16 %EE1.2, i32 2
128+
%EE1.3 = extractelement <8 x i16> %load2d, i32 6
129+
%IE1.3 = insertelement <4 x i16> %IE1.2, i16 %EE1.3, i32 3
130+
; second vector: odd elements 1,3,5,7 of %load2d go to positions 0..3
%EE2.0 = extractelement <8 x i16> %load2d, i32 1
131+
%IE2.0 = insertelement <4 x i16> undef, i16 %EE2.0, i32 0
132+
%EE2.1 = extractelement <8 x i16> %load2d, i32 3
133+
%IE2.1 = insertelement <4 x i16> %IE2.0, i16 %EE2.1, i32 1
134+
%EE2.2 = extractelement <8 x i16> %load2d, i32 5
135+
%IE2.2 = insertelement <4 x i16> %IE2.1, i16 %EE2.2, i32 2
136+
%EE2.3 = extractelement <8 x i16> %load2d, i32 7
137+
%IE2.3 = insertelement <4 x i16> %IE2.2, i16 %EE2.3, i32 3
138+
; each shuffled vector is the only non-constant operand of its dpas call
%dpas1 = call <4 x float> @llvm.genx.GenISA.sub.group.dpas.v4f32.v4f32.v4i16.v4i32(<4 x float> zeroinitializer, <4 x i16> %IE1.3, <4 x i32> zeroinitializer, i32 1, i32 1, i32 1, i32 1, i1 false)
139+
%dpas2 = call <4 x float> @llvm.genx.GenISA.sub.group.dpas.v4f32.v4f32.v4i16.v4i32(<4 x float> zeroinitializer, <4 x i16> %IE2.3, <4 x i32> zeroinitializer, i32 1, i32 1, i32 1, i32 1, i1 false)
140+
ret void
141+
}
142+
143+
144+
; Test kernel: four scalar fmul results are packed into a <4 x float> with insertelement,
; passed through two chained dpas calls, unpacked again with extractelement, and fed back
; into the phi nodes of the single self-looping block. Two trailing fadds consume one
; extracted element each.
; The two fadd expectations reuse the names captured on the extractelement lines
; (TMP37 / TMP38) instead of re-capturing them, so the operand identity is verified.
define spir_kernel void @coalesced_scalars(ptr addrspace(1) %0) {
145+
; CHECK: Function coalesced_scalars
146+
147+
; the IE instructions are marked as SCA. First IE adds regpressure
148+
; then the last usage of the scalar (fadd) kills the hanging values
149+
150+
; CHECK: {{([0-9]+,[ ]*[0-9-]+[ ]*).*[ ]*}} [[TMP17:%.*]] = fmul fast float [[TMP9:%.*]], [[TMP1:%.*]]
151+
; CHECK: {{([0-9]+,[ ]*512[ ]*).*SCA.*[ ]*}} [[TMP19:%.*]] = insertelement <4 x float> zeroinitializer, float [[TMP17]], i64 0
152+
; CHECK: {{([0-9]+,[ ]*[0-9-]+[ ]*).*[ ]*}} [[TMP18:%.*]] = fmul fast float [[TMP10:%.*]], [[TMP2:%.*]]
153+
; CHECK: {{([0-9]+,[ ]*[0-9-]+[ ]*).*SCA.*[ ]*}} [[TMP21:%.*]] = insertelement <4 x float> [[TMP19]], float [[TMP18]], i64 1
154+
; CHECK: {{([0-9]+,[ ]*[0-9-]+[ ]*).*[ ]*}} [[TMP20:%.*]] = fmul fast float [[TMP11:%.*]], [[TMP3:%.*]]
155+
; CHECK: {{([0-9]+,[ ]*0[ ]*).*SCA.*[ ]*}} [[TMP23:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP20]], i64 2
156+
; CHECK: {{([0-9]+,[ ]*[0-9-]+[ ]*).*[ ]*}} [[TMP22:%.*]] = fmul fast float [[TMP12:%.*]], [[TMP4:%.*]]
157+
; CHECK: {{([0-9]+,[ ]*0[ ]*).*SCA.*[ ]*}} [[TMP25:%.*]] = insertelement <4 x float> [[TMP23]], float [[TMP22]], i64 3
158+
159+
; dpas don't add any regpressure, they reuse the registers of the created vector
160+
; CHECK: {{([0-9]+,[ ]*0[ ]*).*[ ]*}} [[TMP33:%.*]] = call <4 x float> @llvm.genx.GenISA.sub.group.dpas.v4f32.v4f32.v4i16.v4i32(<4 x float> [[TMP25]], <4 x i16> zeroinitializer, <4 x i32> zeroinitializer, i32 0, i32 0, i32 0, i32 0, i1 false)
161+
; CHECK: {{([0-9]+,[ ]*0[ ]*).*[ ]*}} [[TMP34:%.*]] = call <4 x float> @llvm.genx.GenISA.sub.group.dpas.v4f32.v4f32.v4i16.v4i32(<4 x float> [[TMP33]], <4 x i16> zeroinitializer, <4 x i32> zeroinitializer, i32 0, i32 0, i32 0, i32 0, i1 false)
162+
163+
; extract the values from the vector. The extractelement instructions are marked as V2S and don't increase regpressure
164+
; the vector hangs, so the last EE doesn't reduce regpressure. In this case the EEs are used in PHI nodes so these hanging vals are not killed
165+
; CHECK: {{([0-9]+,[ ]*0[ ]*).*V2S.*[ ]*}} [[TMP35:%.*]] = extractelement <4 x float> [[TMP34]], i64 0
166+
; CHECK: {{([0-9]+,[ ]*0[ ]*).*V2S.*[ ]*}} [[TMP36:%.*]] = extractelement <4 x float> [[TMP34]], i64 1
167+
; CHECK: {{([0-9]+,[ ]*0[ ]*).*V2S.*[ ]*}} [[TMP37:%.*]] = extractelement <4 x float> [[TMP34]], i64 2
168+
; CHECK: {{([0-9]+,[ ]*0[ ]*).*V2S.*[ ]*}} [[TMP38:%.*]] = extractelement <4 x float> [[TMP34]], i64 3
169+
170+
; fadd kills the hanging values from SCA and creates one float -> -384 (3 * 32 * 4)
171+
; CHECK: {{([0-9]+,[ ]*-384[ ]*).*[ ]*}} [[TMP50:%.*]] = fadd fast float [[TMP37]], [[TMP22]]
172+
; this fadd doesn't kill the hanging EEs (they are used in PHI nodes), so it increases regpressure by 128 (32 * 4)
173+
; CHECK: {{([0-9]+,[ ]*128[ ]*).*[ ]*}} [[TMP51:%.*]] = fadd fast float [[TMP38]], 4.000000e+00
174+
175+
entry:
176+
br label %._crit_edge
177+
178+
._crit_edge: ; preds = %._crit_edge, %entry
179+
; loop-carried values: each phi takes 0.0 from %entry and an extracted dpas element from the back edge
%1 = phi float [ 0.000000e+00, %entry ], [ %19, %._crit_edge ]
180+
%2 = phi float [ 0.000000e+00, %entry ], [ %20, %._crit_edge ]
181+
%3 = phi float [ 0.000000e+00, %entry ], [ %21, %._crit_edge ]
182+
%4 = phi float [ 0.000000e+00, %entry ], [ %22, %._crit_edge ]
183+
%5 = call float @llvm.exp2.f32(float 0.000000e+00)
184+
%6 = call float @llvm.exp2.f32(float 0.000000e+00)
185+
%7 = call float @llvm.exp2.f32(float 0.000000e+00)
186+
%8 = call float @llvm.exp2.f32(float 0.000000e+00)
187+
%9 = fmul fast float %5, %1
188+
%10 = fmul fast float %6, %2
189+
%11 = fmul fast float %7, %3
190+
%12 = fmul fast float %8, %4
191+
; pack the four scalar products into one <4 x float>
%13 = insertelement <4 x float> zeroinitializer, float %9, i64 0
192+
%14 = insertelement <4 x float> %13, float %10, i64 1
193+
%15 = insertelement <4 x float> %14, float %11, i64 2
194+
%16 = insertelement <4 x float> %15, float %12, i64 3
195+
; two chained dpas calls: the second takes the first one's result as its first operand
%17 = call <4 x float> @llvm.genx.GenISA.sub.group.dpas.v4f32.v4f32.v4i16.v4i32(<4 x float> %16, <4 x i16> zeroinitializer, <4 x i32> zeroinitializer, i32 0, i32 0, i32 0, i32 0, i1 false)
196+
%18 = call <4 x float> @llvm.genx.GenISA.sub.group.dpas.v4f32.v4f32.v4i16.v4i32(<4 x float> %17, <4 x i16> zeroinitializer, <4 x i32> zeroinitializer, i32 0, i32 0, i32 0, i32 0, i1 false)
197+
; unpack the result; all four elements feed the phi nodes at the top of the block
%19 = extractelement <4 x float> %18, i64 0
198+
%20 = extractelement <4 x float> %18, i64 1
199+
%21 = extractelement <4 x float> %18, i64 2
200+
%22 = extractelement <4 x float> %18, i64 3
201+
; %23 is the last user of the scalar %12; %24 only uses an extracted element
%23 = fadd fast float %21, %12
202+
%24 = fadd fast float %22, 4.000000e+00
203+
204+
br label %._crit_edge
205+
}
206+
207+
208+
; Test kernel: one dpas result is unpacked with four extractelement instructions and
; each extracted scalar is consumed by its own fadd. The expectations below require the
; extractelements to be tagged V2S with an estimate of 0.
define spir_kernel void @vector_to_scalars_pattern(ptr addrspace(1) %A) {
209+
; CHECK: Function vector_to_scalars_pattern
210+
211+
; DPAS increases regpressure
212+
; CHECK: {{([0-9]+,[ ]*512[ ]*).*[ ]*}} [[DPAS:%.*]] = call <4 x float> @llvm.genx.GenISA.sub.group.dpas.v4f32.v4f32.v4i16.v4i32(<4 x float> zeroinitializer, <4 x i16> undef, <4 x i32> zeroinitializer, i32 1, i32 1, i32 1, i32 1, i1 false)
213+
214+
; EE don't increase regpressure
215+
; CHECK: {{([0-9]+,[ ]*0[ ]*).*V2S.*[ ]*}} [[EE0:%.*]] = extractelement <4 x float> [[DPAS]], i64 0
216+
; CHECK: {{([0-9]+,[ ]*0[ ]*).*V2S.*[ ]*}} [[EE1:%.*]] = extractelement <4 x float> [[DPAS]], i64 1
217+
; CHECK: {{([0-9]+,[ ]*0[ ]*).*V2S.*[ ]*}} [[EE2:%.*]] = extractelement <4 x float> [[DPAS]], i64 2
218+
; CHECK: {{([0-9]+,[ ]*0[ ]*).*V2S.*[ ]*}} [[EE3:%.*]] = extractelement <4 x float> [[DPAS]], i64 3
219+
220+
; The vector doesn't die on the last EE, it hangs.
221+
; The uses of the EEs increase regpressure
222+
; CHECK: {{([0-9]+,[ ]*128[ ]*).*[ ]*}} [[USE0:%.*]] = fadd fast float [[EE0]], 1.0
223+
; CHECK: {{([0-9]+,[ ]*128[ ]*).*[ ]*}} [[USE1:%.*]] = fadd fast float [[EE1]], 2.0
224+
; CHECK: {{([0-9]+,[ ]*128[ ]*).*[ ]*}} [[USE2:%.*]] = fadd fast float [[EE2]], 3.0
225+
; The vector dies on the last EE usage
; NOTE(review): -384 is presumably +128 for the new float minus 512 for the dying vector - confirm
226+
; CHECK: {{([0-9]+,[ ]*-384[ ]*).*[ ]*}} [[USE3:%.*]] = fadd fast float [[EE3]], 4.0
227+
228+
; CHECK: ret void
229+
230+
entry:
231+
; the dpas operands are all constants or undef; %A is not used in this body
%dpas = call <4 x float> @llvm.genx.GenISA.sub.group.dpas.v4f32.v4f32.v4i16.v4i32(<4 x float> zeroinitializer, <4 x i16> undef, <4 x i32> zeroinitializer, i32 1, i32 1, i32 1, i32 1, i1 false)
232+
%ee0 = extractelement <4 x float> %dpas, i64 0
233+
%ee1 = extractelement <4 x float> %dpas, i64 1
234+
%ee2 = extractelement <4 x float> %dpas, i64 2
235+
%ee3 = extractelement <4 x float> %dpas, i64 3
236+
; one fadd per extracted element; the fadd results themselves are unused
%use0 = fadd fast float %ee0, 1.0
237+
%use1 = fadd fast float %ee1, 2.0
238+
%use2 = fadd fast float %ee2, 3.0
239+
%use3 = fadd fast float %ee3, 4.0
240+
ret void
241+
}
242+
243+
244+
245+
; Declarations of the intrinsics called by the test kernels, followed by the attribute
; groups they reference.
; dpas intrinsic: three vector operands, four i32 operands and one i1; returns <4 x float>
declare <4 x float> @llvm.genx.GenISA.sub.group.dpas.v4f32.v4f32.v4i16.v4i32(
246+
<4 x float>, <4 x i16>, <4 x i32>, i32, i32, i32, i32, i1) #1
247+
248+
declare float @llvm.exp2.f32(float) #2
249+
250+
; 2D block read intrinsic returning <8 x i16>; the first operand is an i64
declare <8 x i16> @llvm.genx.GenISA.LSC2DBlockRead.v8i16(i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i1, i1, i32) #1
251+
; NOTE(review): #0 is not referenced by any declaration or definition visible here - confirm it is needed
attributes #0 = { convergent nounwind }
252+
; #1 is used by both GenISA intrinsic declarations
attributes #1 = { convergent nounwind readnone willreturn }
253+
; #2 is used by the llvm.exp2.f32 declaration
attributes #2 = { nofree nosync nounwind readnone speculatable willreturn }

0 commit comments

Comments
 (0)