
Commit 323fb1b

Merge pull request #504 from LLNL/feature/burmark1/noop_kernel

Add empty kernel

2 parents: 0cea63b + 485f596

15 files changed: +945 −0 lines
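
The kernel added here has an empty loop body (the EMPTY_BODY macro presumably expands to nothing), so the benchmark isolates the fixed per-launch and per-iteration overhead of each backend rather than any computation. As a rough standalone illustration of what a no-op kernel measures, here is a minimal CUDA sketch, independent of this commit (noop, reps, and the event-based timing are illustrative, not RAJAPerf code):

#include <cstdio>
#include <cuda_runtime.h>

// A kernel that does nothing: any measured time is launch/dispatch overhead.
__global__ void noop() {}

int main() {
  const int reps = 1000;
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  noop<<<1, 1>>>();            // warm-up launch
  cudaDeviceSynchronize();

  cudaEventRecord(start);
  for (int i = 0; i < reps; ++i) {
    noop<<<1, 1>>>();          // repeated no-op launches
  }
  cudaEventRecord(stop);
  cudaEventSynchronize(stop);

  float ms = 0.f;
  cudaEventElapsedTime(&ms, start, stop);
  std::printf("avg launch overhead: %f us\n", 1000.f * ms / reps);

  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  return 0;
}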

src/CMakeLists.txt (+3 lines)

@@ -96,6 +96,9 @@ blt_add_executable(
   basic/DAXPY_ATOMIC.cpp
   basic/DAXPY_ATOMIC-Seq.cpp
   basic/DAXPY_ATOMIC-OMPTarget.cpp
+  basic/EMPTY.cpp
+  basic/EMPTY-Seq.cpp
+  basic/EMPTY-OMPTarget.cpp
   basic/IF_QUAD.cpp
   basic/IF_QUAD-Seq.cpp
   basic/IF_QUAD-OMPTarget.cpp

src/basic/CMakeLists.txt (+7 lines)

@@ -35,6 +35,13 @@ blt_add_library(
   DAXPY_ATOMIC-Cuda.cpp
   DAXPY_ATOMIC-OMP.cpp
   DAXPY_ATOMIC-OMPTarget.cpp
+  EMPTY.cpp
+  EMPTY-Seq.cpp
+  EMPTY-Hip.cpp
+  EMPTY-Cuda.cpp
+  EMPTY-OMP.cpp
+  EMPTY-OMPTarget.cpp
+  EMPTY-Sycl.cpp
   IF_QUAD.cpp
   IF_QUAD-Seq.cpp
   IF_QUAD-Hip.cpp

src/basic/EMPTY-Cuda.cpp (new file, +184 lines)

@@ -0,0 +1,184 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-25, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "EMPTY.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_CUDA)
+
+#include "common/CudaDataUtils.hpp"
+
+#include <iostream>
+
+namespace rajaperf
+{
+namespace basic
+{
+
+template < size_t block_size >
+__launch_bounds__(block_size)
+__global__ void empty(Index_type iend)
+{
+  Index_type i = blockIdx.x * block_size + threadIdx.x;
+  if (i < iend) {
+    EMPTY_BODY;
+  }
+}
+
+template < size_t block_size >
+__launch_bounds__(block_size)
+__global__ void empty_grid_stride(Index_type iend)
+{
+  Index_type i = blockIdx.x * block_size + threadIdx.x;
+  Index_type grid_stride = gridDim.x * block_size;
+  for ( ; i < iend; i += grid_stride) {
+    EMPTY_BODY;
+  }
+}
+
+
+template < size_t block_size, typename MappingHelper >
+void EMPTY::runCudaVariantImpl(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = getActualProblemSize();
+
+  auto res{getCudaResource()};
+
+  EMPTY_DATA_SETUP;
+
+  if ( vid == Base_CUDA ) {
+
+    auto func = MappingHelper::direct
+                ? &empty<block_size>
+                : &empty_grid_stride<block_size>;
+
+    constexpr size_t shmem = 0;
+    const size_t max_grid_size = RAJAPERF_CUDA_GET_MAX_BLOCKS(
+        MappingHelper, func, block_size, shmem);
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
+      const size_t grid_size = std::min(normal_grid_size, max_grid_size);
+
+      RPlaunchCudaKernel( func,
+                          grid_size, block_size,
+                          shmem, res.get_stream(),
+                          iend );
+
+    }
+    stopTimer();
+
+  } else if ( vid == Lambda_CUDA ) {
+
+    auto empty_lambda = [=] __device__ (Index_type i) {
+      EMPTY_BODY;
+    };
+
+    auto func = MappingHelper::direct
+                ? &lambda_cuda_forall<block_size, decltype(empty_lambda)>
+                : &lambda_cuda_forall_grid_stride<block_size, decltype(empty_lambda)>;
+
+    constexpr size_t shmem = 0;
+    const size_t max_grid_size = RAJAPERF_CUDA_GET_MAX_BLOCKS(
+        MappingHelper, func, block_size, shmem);
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
+      const size_t grid_size = std::min(normal_grid_size, max_grid_size);
+
+      RPlaunchCudaKernel( func,
+                          grid_size, block_size,
+                          shmem, res.get_stream(),
+                          ibegin, iend, empty_lambda );
+
+    }
+    stopTimer();
+
+  } else if ( vid == RAJA_CUDA ) {
+
+    using exec_policy = std::conditional_t<MappingHelper::direct,
+        RAJA::cuda_exec<block_size, true /*async*/>,
+        RAJA::cuda_exec_occ_calc<block_size, true /*async*/>>;
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      RAJA::forall< exec_policy >( res,
+          RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) {
+        EMPTY_BODY;
+      });
+
+    }
+    stopTimer();
+
+  } else {
+    getCout() << "\n  EMPTY : Unknown Cuda variant id = " << vid << std::endl;
+  }
+}
+
+void EMPTY::runCudaVariant(VariantID vid, size_t tune_idx)
+{
+  size_t t = 0;
+
+  seq_for(gpu_block_sizes_type{}, [&](auto block_size) {
+
+    if (run_params.numValidGPUBlockSize() == 0u ||
+        run_params.validGPUBlockSize(block_size)) {
+
+      seq_for(gpu_mapping::forall_helpers{}, [&](auto mapping_helper) {
+
+        if (tune_idx == t) {
+
+          setBlockSize(block_size);
+          runCudaVariantImpl<decltype(block_size){},
+                             decltype(mapping_helper)>(vid);
+
+        }
+
+        t += 1;
+
+      });
+
+    }
+
+  });
+
+}
+
+void EMPTY::setCudaTuningDefinitions(VariantID vid)
+{
+
+  seq_for(gpu_block_sizes_type{}, [&](auto block_size) {
+
+    if (run_params.numValidGPUBlockSize() == 0u ||
+        run_params.validGPUBlockSize(block_size)) {
+
+      seq_for(gpu_mapping::forall_helpers{}, [&](auto mapping_helper) {
+
+        addVariantTuningName(vid, decltype(mapping_helper)::get_name()+"_"+
+                                  std::to_string(block_size));
+
+      });
+
+    }
+
+  });
+
+}
+
+} // end namespace basic
+} // end namespace rajaperf
+
+#endif  // RAJA_ENABLE_CUDA
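
The empty and empty_grid_stride kernels above are the two mappings selected by MappingHelper::direct: the direct form launches one thread per index and needs a grid covering the whole range, while the grid-stride form can run with a capped grid (here the occupancy-derived max_grid_size) and lets each thread loop over several indices. A standalone CUDA sketch of that contrast, with made-up sizes (n, block, and the cap of 120 blocks are illustrative assumptions, not values from this commit):

#include <algorithm>
#include <cuda_runtime.h>

// Direct mapping: the grid must cover every index exactly once.
__global__ void direct_map(int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) { /* body */ }
}

// Grid-stride mapping: a smaller grid works; each thread strides through
// the remaining indices.
__global__ void grid_stride(int n) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += gridDim.x * blockDim.x) { /* body */ }
}

int main() {
  const int n = 1 << 20, block = 256;
  const int full_grid = (n + block - 1) / block;
  const int capped    = std::min(full_grid, 120);  // assumed occupancy cap

  direct_map<<<full_grid, block>>>(n);   // one thread per index
  grid_stride<<<capped, block>>>(n);     // fewer blocks, threads loop
  cudaDeviceSynchronize();
  return 0;
}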

src/basic/EMPTY-Hip.cpp (new file, +184 lines)

@@ -0,0 +1,184 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-25, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "EMPTY.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#if defined(RAJA_ENABLE_HIP)
+
+#include "common/HipDataUtils.hpp"
+
+#include <iostream>
+
+namespace rajaperf
+{
+namespace basic
+{
+
+template < size_t block_size >
+__launch_bounds__(block_size)
+__global__ void empty(Index_type iend)
+{
+  Index_type i = blockIdx.x * block_size + threadIdx.x;
+  if (i < iend) {
+    EMPTY_BODY;
+  }
+}
+
+template < size_t block_size >
+__launch_bounds__(block_size)
+__global__ void empty_grid_stride(Index_type iend)
+{
+  Index_type i = blockIdx.x * block_size + threadIdx.x;
+  Index_type grid_stride = gridDim.x * block_size;
+  for ( ; i < iend; i += grid_stride) {
+    EMPTY_BODY;
+  }
+}
+
+
+template < size_t block_size, typename MappingHelper >
+void EMPTY::runHipVariantImpl(VariantID vid)
+{
+  const Index_type run_reps = getRunReps();
+  const Index_type ibegin = 0;
+  const Index_type iend = getActualProblemSize();
+
+  auto res{getHipResource()};
+
+  EMPTY_DATA_SETUP;
+
+  if ( vid == Base_HIP ) {
+
+    auto func = MappingHelper::direct
+                ? &empty<block_size>
+                : &empty_grid_stride<block_size>;
+
+    constexpr size_t shmem = 0;
+    const size_t max_grid_size = RAJAPERF_HIP_GET_MAX_BLOCKS(
+        MappingHelper, func, block_size, shmem);
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
+      const size_t grid_size = std::min(normal_grid_size, max_grid_size);
+
+      RPlaunchHipKernel( func,
+                         grid_size, block_size,
+                         shmem, res.get_stream(),
+                         iend );
+
+    }
+    stopTimer();
+
+  } else if ( vid == Lambda_HIP ) {
+
+    auto empty_lambda = [=] __device__ (Index_type i) {
+      EMPTY_BODY;
+    };
+
+    auto func = MappingHelper::direct
+                ? &lambda_hip_forall<block_size, decltype(empty_lambda)>
+                : &lambda_hip_forall_grid_stride<block_size, decltype(empty_lambda)>;
+
+    constexpr size_t shmem = 0;
+    const size_t max_grid_size = RAJAPERF_HIP_GET_MAX_BLOCKS(
+        MappingHelper, func, block_size, shmem);
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      const size_t normal_grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size);
+      const size_t grid_size = std::min(normal_grid_size, max_grid_size);
+
+      RPlaunchHipKernel( func,
+                         grid_size, block_size,
+                         shmem, res.get_stream(),
+                         ibegin, iend, empty_lambda );
+
+    }
+    stopTimer();
+
+  } else if ( vid == RAJA_HIP ) {
+
+    using exec_policy = std::conditional_t<MappingHelper::direct,
+        RAJA::hip_exec<block_size, true /*async*/>,
+        RAJA::hip_exec_occ_calc<block_size, true /*async*/>>;
+
+    startTimer();
+    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+      RAJA::forall< exec_policy >( res,
+          RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) {
+        EMPTY_BODY;
+      });
+
+    }
+    stopTimer();
+
+  } else {
+    getCout() << "\n  EMPTY : Unknown Hip variant id = " << vid << std::endl;
+  }
+}
+
+void EMPTY::runHipVariant(VariantID vid, size_t tune_idx)
+{
+  size_t t = 0;
+
+  seq_for(gpu_block_sizes_type{}, [&](auto block_size) {
+
+    if (run_params.numValidGPUBlockSize() == 0u ||
+        run_params.validGPUBlockSize(block_size)) {
+
+      seq_for(gpu_mapping::forall_helpers{}, [&](auto mapping_helper) {
+
+        if (tune_idx == t) {
+
+          setBlockSize(block_size);
+          runHipVariantImpl<decltype(block_size){},
+                            decltype(mapping_helper)>(vid);
+
+        }
+
+        t += 1;
+
+      });
+
+    }
+
+  });
+
+}
+
+void EMPTY::setHipTuningDefinitions(VariantID vid)
+{
+
+  seq_for(gpu_block_sizes_type{}, [&](auto block_size) {
+
+    if (run_params.numValidGPUBlockSize() == 0u ||
+        run_params.validGPUBlockSize(block_size)) {
+
+      seq_for(gpu_mapping::forall_helpers{}, [&](auto mapping_helper) {
+
+        addVariantTuningName(vid, decltype(mapping_helper)::get_name()+"_"+
+                                  std::to_string(block_size));
+
+      });
+
+    }
+
+  });
+
+}
+
+} // end namespace basic
+} // end namespace rajaperf
+
+#endif  // RAJA_ENABLE_HIP
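
In both the CUDA and HIP files, runCudaVariant/runHipVariant flatten the compile-time lists of GPU block sizes and mapping helpers into a single linear tuning index: a counter t is bumped for every (block size, mapping) pair, and the requested tune_idx selects exactly one of them, in the same order that setCudaTuningDefinitions/setHipTuningDefinitions register tuning names. A plain C++ sketch of that flattening, with illustrative values (the block sizes and mapping names below are assumptions, not the suite's actual lists):

#include <cstddef>
#include <cstdio>

int main() {
  const size_t block_sizes[] = {128, 256, 512};   // assumed example values
  const char*  mappings[]    = {"direct", "occgs"};

  // Nested loops enumerate every combination; t is the flattened tune_idx.
  size_t t = 0;
  for (size_t bs : block_sizes) {
    for (const char* m : mappings) {
      // In the suite, the body runs only when tune_idx == t; the tuning
      // name registered for the same t follows the mapping_name + "_" +
      // block_size convention seen in addVariantTuningName.
      std::printf("tune_idx %zu -> %s_%zu\n", t, m, bs);
      ++t;
    }
  }
  return 0;
}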
