@@ -11,6 +11,32 @@ void intel_sub_group_2d_block_read_transpose_32b_32r1x1c(global void* base_addre
1111 destination [1 ] = temp .s1 ;
1212}
1313
14+ void __spirv_Subgroup2DBlockLoadINTEL (
15+ int element_size , int block_width , int block_height , int block_count ,
16+ const __global void * src_base_pointer , int memory_width , int memory_height , int memory_pitch ,
17+ int2 coordinate ,
18+ private void * dst_pointer );
19+ void __spirv_Subgroup2DBlockLoadTransposeINTEL (
20+ int element_size , int block_width , int block_height , int block_count ,
21+ const __global void * src_base_pointer , int memory_width , int memory_height , int memory_pitch ,
22+ int2 coordinate ,
23+ private void * dst_pointer );
24+ void __spirv_Subgroup2DBlockLoadTransformINTEL (
25+ int element_size , int block_width , int block_height , int block_count ,
26+ const __global void * src_base_pointer , int memory_width , int memory_height , int memory_pitch ,
27+ int2 coordinate ,
28+ private void * dst_pointer );
29+ void __spirv_Subgroup2DBlockPrefetchINTEL (
30+ int element_size , int block_width , int block_height , int block_count ,
31+ const __global void * src_base_pointer , int memory_width , int memory_height , int memory_pitch ,
32+ int2 coordinate );
33+ void __spirv_Subgroup2DBlockStoreINTEL (
34+ int element_size , int block_width , int block_height , int block_count ,
35+ const private void * src_pointer ,
36+ __global void * dst_base_pointer ,
37+ int memory_width , int memory_height , int memory_pitch ,
38+ int2 coordinate );
39+
1440__attribute__((intel_reqd_sub_group_size (16 )))
1541kernel void BlockReadTest (global void * matrix , int bytewidth , int height )
1642{
@@ -38,7 +64,7 @@ kernel void BlockReadTest(global void* matrix, int bytewidth, int height)
3864 intel_sub_group_2d_block_read_16b_4r16x1c (matrix , bytewidth , height , bytepitch , coord , data );
3965 printf ("GID %3d: data = %04X %04X %04X %04X\n" , (int )get_global_id (0 ),
4066 data [0 ], data [1 ], data [2 ], data [3 ]);
41- #elif 1
67+ #elif 0
4268 // This is another multi-row 2D block read.
4369 // Each work-item gets 32 8-bit values, from four different 8 row x 16 column blocks.
4470 // The first 8 8-bit values are the 32 rows from a column of the first block.
@@ -51,7 +77,7 @@ kernel void BlockReadTest(global void* matrix, int bytewidth, int height)
5177 data [ 8 ], data [ 9 ], data [10 ], data [11 ], data [12 ], data [13 ], data [14 ], data [15 ],
5278 data [16 ], data [17 ], data [18 ], data [19 ], data [20 ], data [21 ], data [22 ], data [23 ],
5379 data [24 ], data [25 ], data [26 ], data [27 ], data [28 ], data [29 ], data [30 ], data [31 ]);
54- #elif 1
80+ #elif 0
5581 // This is another multi-row 2D block read.
5682 // Each work-item gets 128 8-bit values, from four different 32 row x 16 column blocks.
5783 // The first 32 8-bit values are the 32 rows from a column of the first block.
@@ -64,7 +90,7 @@ kernel void BlockReadTest(global void* matrix, int bytewidth, int height)
6490 data [16 ], data [17 ], data [18 ], data [19 ], data [20 ], data [21 ], data [22 ], data [23 ],
6591 data [24 ], data [25 ], data [26 ], data [27 ], data [28 ], data [29 ], data [30 ], data [31 ],
6692 data [32 ], data [33 ], data [34 ], data [35 ], data [36 ], data [37 ], data [38 ], data [39 ]);
67- #elif 1
93+ #elif 0
6894 // This is another multi-row 2D block read.
6995 // Each work-item gets 128 8-bit values, from four different 32 row x 16 column blocks.
7096 // The first 32 8-bit values are the 32 rows from a column of the first block.
@@ -85,15 +111,15 @@ kernel void BlockReadTest(global void* matrix, int bytewidth, int height)
85111 intel_sub_group_2d_block_read_transpose_32b_16r8x1c (matrix , bytewidth , height , bytepitch , coord , data );
86112 printf ("GID %3d: data = %08X %08X %08X %08X %08X %08X %08X %08X ...\n" , (int )get_global_id (0 ),
87113 data [0 ], data [1 ], data [2 ], data [3 ], data [4 ], data [5 ], data [6 ], data [7 ]);
88- #elif 1
114+ #elif 0
89115 // This is a more complicated transposed 2D block read, since there are 32 rows (pre-transpose) and only 16 work-items.
90116 // Each work-item gets 16 32-bit values, where each 32-bit value contains two columns of data (pre-transpose).
91117 // Each work-item therefore gets 16 columns of data from one matrix row, and 16 columns of data from another matrix row.
92118 // The data from the two matrix rows are interleaved, so there are two columns of data from one row, then two columns from the other row, etc.
93119 uint data [2 ];
94120 intel_sub_group_2d_block_read_transpose_32b_32r1x1c (matrix , bytewidth , height , bytepitch , coord , data );
95121 printf ("GID %3d: data = %08X %08X\n" , (int )get_global_id (0 ), data [0 ], data [1 ]);
96- #elif 1
122+ #elif 0
97123 // This is a more complicated transposed 2D block read, since there are 32 rows (pre-transpose) and only 16 work-items.
98124 // Each work-item gets 16 32-bit values, where each 32-bit value contains two columns of data (pre-transpose).
99125 // Each work-item therefore gets 16 columns of data from one matrix row, and 16 columns of data from another matrix row.
@@ -102,5 +128,14 @@ kernel void BlockReadTest(global void* matrix, int bytewidth, int height)
102128 intel_sub_group_2d_block_read_transpose_32b_32r8x1c (matrix , bytewidth , height , bytepitch , coord , data );
103129 printf ("GID %3d: data = %08X %08X %08X %08X %08X %08X %08X %08X ...\n" , (int )get_global_id (0 ),
104130 data [0 ], data [1 ], data [2 ], data [3 ], data [4 ], data [5 ], data [6 ], data [7 ]);
131+ #elif 1
132+ // This is a multi-row, multi-block 2D block read.
133+ // Each work-item gets four 8-bit values, from two different 2 row x 16 column blocks.
134+ // The first two 8-bit values are the two rows from a column of the first block.
135+ // The second two 8-bit values are the two rows from a column of the second block.
136+ uchar data [4 ];
137+ __spirv_Subgroup2DBlockLoadINTEL (1 , 16 , 2 , 2 , matrix , bytewidth , height , bytepitch , coord , data );
138+ printf ("GID %3d: data = %02X %02X %02X %02X\n" , (int )get_global_id (0 ),
139+ data [ 0 ], data [ 1 ], data [ 2 ], data [ 3 ]);
105140#endif
106141}
0 commit comments