22#error cl_intel_subgroup_2d_block_io is not supported!
33#endif
44
5- ushort2 __builtin_IB_subgroup_block_read_flat_u16_m2k16v1 (long baseoffset , int width_minus_one , int height_minus_one , int pitch_minus_one , int2 coord );
5+ uint2 __builtin_IB_subgroup_block_read_flat_transpose_u32_m32k1 (long baseoffset , int width_minus_one , int height_minus_one , int pitch_minus_one , int2 coord );
6+
// Wrapper around the __builtin_IB_subgroup_block_read_flat_transpose_u32_m32k1 built-in.
// base_address is reinterpreted with as_long, and width, height, and pitch are each
// reduced by one before the call, matching the built-in's *_minus_one parameters.
// coord is passed through unchanged.
// The two components of the returned uint2 are stored to destination[0] and
// destination[1], so destination must have room for at least two uints.
// NOTE(review): judging by the name, this is a transposed read of 32-bit elements from
// a 32 row x 1 column block -- confirm against the extension specification.
7+ void intel_sub_group_2d_block_read_transpose_32b_32r1x1c (global void * base_address , int width , int height , int pitch , int2 coord , private uint * destination )
8+ {
9+ uint2 temp = __builtin_IB_subgroup_block_read_flat_transpose_u32_m32k1 (as_long (base_address ), width - 1 , height - 1 , pitch - 1 , coord );
10+ destination [0 ] = temp .s0 ;
11+ destination [1 ] = temp .s1 ;
12+ }
613
714__attribute__((intel_reqd_sub_group_size (16 )))
8- kernel void BlockReadTest (global ushort * matrix , int width , int height )
15+ kernel void BlockReadTest (global void * matrix , int bytewidth , int height )
916{
1017 int2 coord = (int2 )(0 , 0 );
11- int bytewidth = width * sizeof (ushort );
1218 int bytepitch = bytewidth ;
1319#if 0
1420 // This is the most basic 2D block read.
@@ -32,6 +38,45 @@ kernel void BlockReadTest(global ushort* matrix, int width, int height)
3238 intel_sub_group_2d_block_read_16b_4r16x1c (matrix , bytewidth , height , bytepitch , coord , data );
3339 printf ("GID %3d: data = %04X %04X %04X %04X\n" , (int )get_global_id (0 ),
3440 data [0 ], data [1 ], data [2 ], data [3 ]);
41+ #elif 1
42+ // This is another multi-row 2D block read.
43+ // Each work-item gets 32 8-bit values, from four different 8 row x 16 column blocks.
44+ // The first 8 8-bit values are the 8 rows from a column of the first block.
45+ // The second 8 8-bit values are the 8 rows from a column of the second block, etc.
46+ uchar data [32 ];
47+ intel_sub_group_2d_block_read_8b_8r16x4c (matrix , bytewidth , height , bytepitch , coord , data );
48+ printf ("GID %3d: data = %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X\n" ,
49+ (int )get_global_id (0 ),
50+ data [ 0 ], data [ 1 ], data [ 2 ], data [ 3 ], data [ 4 ], data [ 5 ], data [ 6 ], data [ 7 ],
51+ data [ 8 ], data [ 9 ], data [10 ], data [11 ], data [12 ], data [13 ], data [14 ], data [15 ],
52+ data [16 ], data [17 ], data [18 ], data [19 ], data [20 ], data [21 ], data [22 ], data [23 ],
53+ data [24 ], data [25 ], data [26 ], data [27 ], data [28 ], data [29 ], data [30 ], data [31 ]);
54+ #elif 1
55+ // This is another multi-row 2D block read.
56+ // Each work-item gets 128 8-bit values, from four different 32 row x 16 column blocks.
57+ // The first 32 8-bit values are the 32 rows from a column of the first block.
58+ // The second 32 8-bit values are the 32 rows from a column of the second block, etc.
59+ uchar data [128 ];
60+ intel_sub_group_2d_block_read_8b_32r16x4c (matrix , bytewidth , height , bytepitch , coord , data );
61+ printf ("GID %3d: data = %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X ...\n" , (int )get_global_id (0 ),
62+ data [ 0 ], data [ 1 ], data [ 2 ], data [ 3 ], data [ 4 ], data [ 5 ], data [ 6 ], data [ 7 ],
63+ data [ 8 ], data [ 9 ], data [10 ], data [11 ], data [12 ], data [13 ], data [14 ], data [15 ],
64+ data [16 ], data [17 ], data [18 ], data [19 ], data [20 ], data [21 ], data [22 ], data [23 ],
65+ data [24 ], data [25 ], data [26 ], data [27 ], data [28 ], data [29 ], data [30 ], data [31 ],
66+ data [32 ], data [33 ], data [34 ], data [35 ], data [36 ], data [37 ], data [38 ], data [39 ]);
67+ #elif 1
68+ // This is another multi-row 2D block read.
69+ // Each work-item gets 128 8-bit values, from four different 32 row x 16 column blocks.
70+ // The first 32 8-bit values are the 32 rows from a column of the first block.
71+ // The second 32 8-bit values are the 32 rows from a column of the second block, etc.
72+ uchar data [128 ];
73+ intel_sub_group_2d_block_read_8b_32r16x4c (matrix , bytewidth , height , bytepitch , coord , data );
74+ printf ("GID %3d: data = %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X ...\n" , (int )get_global_id (0 ),
75+ data [ 0 ], data [ 1 ], data [ 2 ], data [ 3 ], data [ 4 ], data [ 5 ], data [ 6 ], data [ 7 ],
76+ data [ 8 ], data [ 9 ], data [10 ], data [11 ], data [12 ], data [13 ], data [14 ], data [15 ],
77+ data [16 ], data [17 ], data [18 ], data [19 ], data [20 ], data [21 ], data [22 ], data [23 ],
78+ data [24 ], data [25 ], data [26 ], data [27 ], data [28 ], data [29 ], data [30 ], data [31 ],
79+ data [32 ], data [33 ], data [34 ], data [35 ], data [36 ], data [37 ], data [38 ], data [39 ]);
3580#elif 0
3681 // This is the most basic transposed 2D block read, given that we have not implemented a single-column transposed block read.
3782 // Each work-item gets eight 32-bit values, where each 32-bit value contains two columns of data (pre-transpose).
@@ -40,6 +85,14 @@ kernel void BlockReadTest(global ushort* matrix, int width, int height)
4085 intel_sub_group_2d_block_read_transpose_32b_16r8x1c (matrix , bytewidth , height , bytepitch , coord , data );
4186 printf ("GID %3d: data = %08X %08X %08X %08X %08X %08X %08X %08X ...\n" , (int )get_global_id (0 ),
4287 data [0 ], data [1 ], data [2 ], data [3 ], data [4 ], data [5 ], data [6 ], data [7 ]);
88+ #elif 1
89+ // This is a single-column transposed 2D block read, with 32 rows (pre-transpose) and only 16 work-items.
90+ // Each work-item gets two 32-bit values (data[0] and data[1]), both from the same 32-bit wide column (pre-transpose).
91+ // Each work-item therefore gets data from two different matrix rows, one row per 32-bit value.
92+ // If the matrix elements are 16 bits wide, each 32-bit value contains two columns of data from its row.
93+ uint data [2 ];
94+ intel_sub_group_2d_block_read_transpose_32b_32r1x1c (matrix , bytewidth , height , bytepitch , coord , data );
95+ printf ("GID %3d: data = %08X %08X\n" , (int )get_global_id (0 ), data [0 ], data [1 ]);
4396#elif 1
4497 // This is a more complicated transposed 2D block read, since there are 32 rows (pre-transpose) and only 16 work-items.
4598 // Each work-item gets 16 32-bit values, where each 32-bit value contains two columns of data (pre-transpose).
0 commit comments