Skip to content

Commit e8cc364

Browse files
committed
add a tester for block reads
1 parent d56380b commit e8cc364

File tree

4 files changed

+218
-0
lines changed

4 files changed

+218
-0
lines changed
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# Copyright (c) 2025 Ben Ashbaugh
2+
#
3+
# SPDX-License-Identifier: MIT
4+
5+
add_opencl_sample(
6+
TEST
7+
NUMBER 99
8+
TARGET blockreads
9+
VERSION 120
10+
SOURCES main.cpp
11+
KERNELS block_read_kernel.cl)
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
#if !defined(cl_intel_subgroup_2d_block_io)
2+
#error cl_intel_subgroup_2d_block_io is not supported!
3+
#endif
4+
5+
ushort2 __builtin_IB_subgroup_block_read_flat_u16_m2k16v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord);
6+
7+
__attribute__((intel_reqd_sub_group_size(16)))
8+
kernel void BlockReadTest(global ushort* matrix, int width, int height)
9+
{
10+
int2 coord = (int2)(0, 0);
11+
int bytewidth = width * sizeof(ushort);
12+
int bytepitch = bytewidth;
13+
#if 0
14+
// This is the most basic 2D block read.
15+
// Each work-item gets one 16-bit value, from a single row.
16+
ushort data[1];
17+
intel_sub_group_2d_block_read_16b_1r16x1c(matrix, bytewidth, height, bytepitch, coord, data);
18+
printf("GID %3d: data = %04X\n", (int)get_global_id(0),
19+
data[0]);
20+
#elif 0
21+
// This is a multi-row 2D block read.
22+
// Each work-item gets two 16-bit values, one from the first row, and one from the second row.
23+
ushort data[2];
24+
intel_sub_group_2d_block_read_16b_2r16x1c(matrix, bytewidth, height, bytepitch, coord, data);
25+
printf("GID %3d: data = %04X %04X\n", (int)get_global_id(0),
26+
data[0], data[1]);
27+
#elif 0
28+
// This is another multi-row 2D block read.
29+
// Each work-item gets four 16-bit values, one from the first row, and one from the second row, etc.
30+
// Each work-item therefore gets four rows of data from the same matrix column.
31+
ushort data[4];
32+
intel_sub_group_2d_block_read_16b_4r16x1c(matrix, bytewidth, height, bytepitch, coord, data);
33+
printf("GID %3d: data = %04X %04X %04X %04X\n", (int)get_global_id(0),
34+
data[0], data[1], data[2], data[3]);
35+
#elif 0
36+
// This is the most basic transposed 2D block read, given that we have not implemented a single-column transposed block read.
37+
// Each work-item gets eight 32-bit values, where each 32-bit value contains two columns of data (pre-transpose).
38+
// Each work-item therefore gets 16 columns of data from the same matrix row.
39+
uint data[8];
40+
intel_sub_group_2d_block_read_transpose_32b_16r8x1c(matrix, bytewidth, height, bytepitch, coord, data);
41+
printf("GID %3d: data = %08X %08X %08X %08X %08X %08X %08X %08X ...\n", (int)get_global_id(0),
42+
data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]);
43+
#elif 1
44+
// This is a more complicated transposed 2D block read, since there are 32 rows (pre-transpose) and only 16 work-items.
45+
// Each work-item gets 16 32-bit values, where each 32-bit value contains two columns of data (pre-transpose).
46+
// Each work-item therefore gets 16 columns of data from one matrix row, and 16 columns of data from another matrix row.
47+
// The data from the two matrix rows are interleaved, so there are two columns of data from one row, then two columns from the other row, etc.
48+
uint data[16];
49+
intel_sub_group_2d_block_read_transpose_32b_32r8x1c(matrix, bytewidth, height, bytepitch, coord, data);
50+
printf("GID %3d: data = %08X %08X %08X %08X %08X %08X %08X %08X ...\n", (int)get_global_id(0),
51+
data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]);
52+
#endif
53+
}

samples/99_blockreads/main.cpp

Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
/*
2+
// Copyright (c) 2019-2025 Ben Ashbaugh
3+
//
4+
// SPDX-License-Identifier: MIT
5+
*/
6+
7+
#include <popl/popl.hpp>
8+
9+
#include <CL/opencl.hpp>
10+
11+
#include <fstream>
12+
#include <string>
13+
14+
#include "util.hpp"
15+
16+
static std::string readStringFromFile(
17+
const std::string& filename )
18+
{
19+
std::ifstream is(filename, std::ios::binary);
20+
if (!is.good()) {
21+
printf("Couldn't open file '%s'!\n", filename.c_str());
22+
return "";
23+
}
24+
25+
size_t filesize = 0;
26+
is.seekg(0, std::ios::end);
27+
filesize = (size_t)is.tellg();
28+
is.seekg(0, std::ios::beg);
29+
30+
std::string source{
31+
std::istreambuf_iterator<char>(is),
32+
std::istreambuf_iterator<char>() };
33+
34+
return source;
35+
}
36+
37+
template <typename T>
38+
static void fill_matrix(std::vector<T>& M, size_t numRows, size_t numCols)
39+
{
40+
for (size_t r = 0; r < numRows; r++) {
41+
for (size_t c = 0; c < numCols; c++) {
42+
T value = static_cast<T>(((r % 256) * 256) + (c % 256));
43+
M.push_back(value);
44+
}
45+
}
46+
}
47+
48+
int main(
49+
int argc,
50+
char** argv )
51+
{
52+
int platformIndex = 0;
53+
int deviceIndex = 0;
54+
55+
std::string fileName("block_read_kernel.cl");
56+
std::string kernelName("BlockReadTest");
57+
std::string buildOptions;
58+
size_t gwx = 512;
59+
60+
{
61+
popl::OptionParser op("Supported Options");
62+
op.add<popl::Value<int>>("p", "platform", "Platform Index", platformIndex, &platformIndex);
63+
op.add<popl::Value<int>>("d", "device", "Device Index", deviceIndex, &deviceIndex);
64+
op.add<popl::Value<std::string>>("", "file", "Kernel File Name", fileName, &fileName);
65+
op.add<popl::Value<std::string>>("", "name", "Kernel Name", kernelName, &kernelName);
66+
op.add<popl::Value<std::string>>("", "options", "Program Build Options", buildOptions, &buildOptions);
67+
op.add<popl::Value<size_t>>("", "gwx", "Global Work Size", gwx, &gwx);
68+
bool printUsage = false;
69+
try {
70+
op.parse(argc, argv);
71+
} catch (std::exception& e) {
72+
fprintf(stderr, "Error: %s\n\n", e.what());
73+
printUsage = true;
74+
}
75+
76+
if (printUsage || !op.unknown_options().empty() || !op.non_option_args().empty()) {
77+
fprintf(stderr,
78+
"Usage: blockreads [options]\n"
79+
"%s", op.help().c_str());
80+
return -1;
81+
}
82+
}
83+
84+
std::vector<cl::Platform> platforms;
85+
cl::Platform::get(&platforms);
86+
87+
printf("Running on platform: %s\n",
88+
platforms[platformIndex].getInfo<CL_PLATFORM_NAME>().c_str() );
89+
90+
std::vector<cl::Device> devices;
91+
platforms[platformIndex].getDevices(CL_DEVICE_TYPE_ALL, &devices);
92+
93+
printf("Running on device: %s\n",
94+
devices[deviceIndex].getInfo<CL_DEVICE_NAME>().c_str() );
95+
96+
bool has_cl_intel_subgroup_2d_block_io =
97+
checkDeviceForExtension(devices[deviceIndex], "cl_intel_subgroup_2d_block_io");
98+
if (has_cl_intel_subgroup_2d_block_io) {
99+
printf("Device supports cl_intel_subgroup_2d_block_io.\n");
100+
} else {
101+
printf("Device does not support cl_intel_subgroup_2d_block_io, exiting.\n");
102+
return -1;
103+
}
104+
cl::Context context{devices[deviceIndex]};
105+
cl::CommandQueue commandQueue{context, devices[deviceIndex]};
106+
107+
printf("Reading program source from file: %s\n", fileName.c_str() );
108+
std::string kernelString = readStringFromFile(fileName.c_str());
109+
110+
printf("Building program with build options: %s\n",
111+
buildOptions.empty() ? "(none)" : buildOptions.c_str() );
112+
cl::Program program{ context, kernelString };
113+
program.build(buildOptions.c_str());
114+
for( auto& device : program.getInfo<CL_PROGRAM_DEVICES>() )
115+
{
116+
printf("Program build log for device %s:\n",
117+
device.getInfo<CL_DEVICE_NAME>().c_str() );
118+
printf("%s\n",
119+
program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device).c_str() );
120+
}
121+
printf("Creating kernel: %s\n", kernelName.c_str() );
122+
cl::Kernel kernel = cl::Kernel{ program, kernelName.c_str() };
123+
124+
constexpr size_t numRows = 64;
125+
constexpr size_t numCols = 64;
126+
127+
std::vector<uint16_t> matrix;
128+
matrix.reserve(numRows * numCols);
129+
fill_matrix(matrix, numRows, numCols);
130+
131+
cl::Buffer mem = cl::Buffer{
132+
context,
133+
CL_MEM_COPY_HOST_PTR,
134+
matrix.size() * sizeof(matrix[0]),
135+
matrix.data() };
136+
137+
// execution
138+
kernel.setArg(0, mem);
139+
kernel.setArg(1, static_cast<int>(numRows));
140+
kernel.setArg(2, static_cast<int>(numCols));
141+
commandQueue.enqueueNDRangeKernel(
142+
kernel,
143+
cl::NullRange,
144+
cl::NDRange{16},
145+
cl::NDRange{16} );
146+
147+
commandQueue.finish();
148+
149+
printf("Done.\n");
150+
151+
return 0;
152+
}

samples/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,3 +91,5 @@ if(BUILD_EXTENSION_SAMPLES)
9191
add_subdirectory( 14_ooqcommandbuffers )
9292
add_subdirectory( 15_mutablecommandbufferasserts )
9393
endif()
94+
95+
add_subdirectory( 99_blockreads )

0 commit comments

Comments
 (0)