Skip to content

Commit 3038acd

Browse files
committed
add more block read varieties
1 parent e8cc364 commit 3038acd

File tree

2 files changed

+73
-8
lines changed

2 files changed

+73
-8
lines changed

samples/99_blockreads/block_read_kernel.cl

Lines changed: 56 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,19 @@
22
#error cl_intel_subgroup_2d_block_io is not supported!
33
#endif
44

5-
ushort2 __builtin_IB_subgroup_block_read_flat_u16_m2k16v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord);
5+
uint2 __builtin_IB_subgroup_block_read_flat_transpose_u32_m32k1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord);
6+
7+
void intel_sub_group_2d_block_read_transpose_32b_32r1x1c(global void* base_address, int width, int height, int pitch, int2 coord, private uint* destination)
8+
{
9+
uint2 temp = __builtin_IB_subgroup_block_read_flat_transpose_u32_m32k1(as_long(base_address), width - 1, height - 1, pitch - 1, coord);
10+
destination[0] = temp.s0;
11+
destination[1] = temp.s1;
12+
}
613

714
__attribute__((intel_reqd_sub_group_size(16)))
8-
kernel void BlockReadTest(global ushort* matrix, int width, int height)
15+
kernel void BlockReadTest(global void* matrix, int bytewidth, int height)
916
{
1017
int2 coord = (int2)(0, 0);
11-
int bytewidth = width * sizeof(ushort);
1218
int bytepitch = bytewidth;
1319
#if 0
1420
// This is the most basic 2D block read.
@@ -32,6 +38,45 @@ kernel void BlockReadTest(global ushort* matrix, int width, int height)
3238
intel_sub_group_2d_block_read_16b_4r16x1c(matrix, bytewidth, height, bytepitch, coord, data);
3339
printf("GID %3d: data = %04X %04X %04X %04X\n", (int)get_global_id(0),
3440
data[0], data[1], data[2], data[3]);
41+
#elif 1
42+
// This is another multi-row 2D block read.
43+
// Each work-item gets 32 8-bit values, from four different 8 row x 16 column blocks.
44+
// The first 8 8-bit values are the 8 rows from a column of the first block.
45+
// The second 8 8-bit values are the 8 rows from a column of the second block, etc.
46+
uchar data[32];
47+
intel_sub_group_2d_block_read_8b_8r16x4c(matrix, bytewidth, height, bytepitch, coord, data);
48+
printf("GID %3d: data = %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X\n",
49+
(int)get_global_id(0),
50+
data[ 0], data[ 1], data[ 2], data[ 3], data[ 4], data[ 5], data[ 6], data[ 7],
51+
data[ 8], data[ 9], data[10], data[11], data[12], data[13], data[14], data[15],
52+
data[16], data[17], data[18], data[19], data[20], data[21], data[22], data[23],
53+
data[24], data[25], data[26], data[27], data[28], data[29], data[30], data[31]);
54+
#elif 1
55+
// This is another multi-row 2D block read.
56+
// Each work-item gets 128 8-bit values, from four different 32 row x 16 column blocks.
57+
// The first 32 8-bit values are the 32 rows from a column of the first block.
58+
// The second 32 8-bit values are the 32 rows from a column of the second block, etc.
59+
uchar data[128];
60+
intel_sub_group_2d_block_read_8b_32r16x4c(matrix, bytewidth, height, bytepitch, coord, data);
61+
printf("GID %3d: data = %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X ...\n", (int)get_global_id(0),
62+
data[ 0], data[ 1], data[ 2], data[ 3], data[ 4], data[ 5], data[ 6], data[ 7],
63+
data[ 8], data[ 9], data[10], data[11], data[12], data[13], data[14], data[15],
64+
data[16], data[17], data[18], data[19], data[20], data[21], data[22], data[23],
65+
data[24], data[25], data[26], data[27], data[28], data[29], data[30], data[31],
66+
data[32], data[33], data[34], data[35], data[36], data[37], data[38], data[39]);
67+
#elif 1
68+
// This is another multi-row 2D block read.
69+
// Each work-item gets 128 8-bit values, from four different 32 row x 16 column blocks.
70+
// The first 32 8-bit values are the 32 rows from a column of the first block.
71+
// The second 32 8-bit values are the 32 rows from a column of the second block, etc.
72+
uchar data[128];
73+
intel_sub_group_2d_block_read_8b_32r16x4c(matrix, bytewidth, height, bytepitch, coord, data);
74+
printf("GID %3d: data = %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X ...\n", (int)get_global_id(0),
75+
data[ 0], data[ 1], data[ 2], data[ 3], data[ 4], data[ 5], data[ 6], data[ 7],
76+
data[ 8], data[ 9], data[10], data[11], data[12], data[13], data[14], data[15],
77+
data[16], data[17], data[18], data[19], data[20], data[21], data[22], data[23],
78+
data[24], data[25], data[26], data[27], data[28], data[29], data[30], data[31],
79+
data[32], data[33], data[34], data[35], data[36], data[37], data[38], data[39]);
3580
#elif 0
3681
// This is the most basic transposed 2D block read, given that we have not implemented a single-column transposed block read.
3782
// Each work-item gets eight 32-bit values, where each 32-bit value contains two columns of data (pre-transpose).
@@ -40,6 +85,14 @@ kernel void BlockReadTest(global ushort* matrix, int width, int height)
4085
intel_sub_group_2d_block_read_transpose_32b_16r8x1c(matrix, bytewidth, height, bytepitch, coord, data);
4186
printf("GID %3d: data = %08X %08X %08X %08X %08X %08X %08X %08X ...\n", (int)get_global_id(0),
4287
data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]);
88+
#elif 1
89+
// This is a more complicated transposed 2D block read, since there are 32 rows (pre-transpose) and only 16 work-items.
90+
// Each work-item gets two 32-bit values, since the block is only one 32-bit column wide (pre-transpose).
91+
// Each work-item therefore gets one 32-bit column of data from one matrix row, and one 32-bit column of data from another matrix row.
92+
// The data from the two matrix rows are returned separately, so data[0] holds the value from one row and data[1] holds the value from the other row.
93+
uint data[2];
94+
intel_sub_group_2d_block_read_transpose_32b_32r1x1c(matrix, bytewidth, height, bytepitch, coord, data);
95+
printf("GID %3d: data = %08X %08X\n", (int)get_global_id(0), data[0], data[1]);
4396
#elif 1
4497
// This is a more complicated transposed 2D block read, since there are 32 rows (pre-transpose) and only 16 work-items.
4598
// Each work-item gets 16 32-bit values, where each 32-bit value contains two columns of data (pre-transpose).

samples/99_blockreads/main.cpp

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -35,16 +35,27 @@ static std::string readStringFromFile(
3535
}
3636

3737
template <typename T>
38-
static void fill_matrix(std::vector<T>& M, size_t numRows, size_t numCols)
38+
void fill_matrix(std::vector<T>& M, size_t numRows, size_t numCols)
3939
{
4040
for (size_t r = 0; r < numRows; r++) {
4141
for (size_t c = 0; c < numCols; c++) {
42-
T value = static_cast<T>(((r % 256) * 256) + (c % 256));
42+
T value = static_cast<T>(((r % 256) * 65536) + (c % 256));
4343
M.push_back(value);
4444
}
4545
}
4646
}
4747

48+
template <>
49+
void fill_matrix(std::vector<uint8_t>& M, size_t numRows, size_t numCols)
50+
{
51+
uint8_t value = 0;
52+
for (size_t r = 0; r < numRows; r++) {
53+
for (size_t c = 0; c < numCols; c++) {
54+
M.push_back(value++);
55+
}
56+
}
57+
}
58+
4859
int main(
4960
int argc,
5061
char** argv )
@@ -124,7 +135,8 @@ int main(
124135
constexpr size_t numRows = 64;
125136
constexpr size_t numCols = 64;
126137

127-
std::vector<uint16_t> matrix;
138+
//std::vector<uint32_t> matrix;
139+
std::vector<uint8_t> matrix;
128140
matrix.reserve(numRows * numCols);
129141
fill_matrix(matrix, numRows, numCols);
130142

@@ -136,8 +148,8 @@ int main(
136148

137149
// execution
138150
kernel.setArg(0, mem);
139-
kernel.setArg(1, static_cast<int>(numRows));
140-
kernel.setArg(2, static_cast<int>(numCols));
151+
kernel.setArg(1, static_cast<int>(numCols * sizeof(matrix[0])));
152+
kernel.setArg(2, static_cast<int>(numRows));
141153
commandQueue.enqueueNDRangeKernel(
142154
kernel,
143155
cl::NullRange,

0 commit comments

Comments
 (0)