Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
152 changes: 152 additions & 0 deletions benchmarks/bench_fft.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -164,4 +164,156 @@ BENCHMARK(BM_fft_c2r_1d<float, 64, 500, gt::space::managed>)
BENCHMARK(BM_fft_c2r_1d<double, 64, 500, gt::space::managed>)
->Unit(benchmark::kMillisecond);

// ======================================================================
// BM_fft_r2c_2d
//
// Benchmark a batched 2d real-to-complex forward FFT of size Nx x Ny,
// with 1024 * batch_k independent transforms per call.

template <typename E, int Nx, int Ny, int batch_k,
          typename S = gt::space::device>
static void BM_fft_r2c_2d(benchmark::State& state)
{
  using T = gt::complex<E>;
  const int batch_size = 1024 * batch_k;

  auto h_A = gt::zeros<E>({Nx, Ny, batch_size});
  gt::bm::gtensor2<E, 3, S> d_A(h_A.shape());

  // r2c output keeps only Nx/2+1 rows (Hermitian symmetry of real input)
  auto h_B = gt::empty<T>({Nx / 2 + 1, Ny, batch_size});
  gt::bm::gtensor2<T, 3, S> d_B(h_B.shape());

  // Set up periodic domain with frequencies 4 and 2
  // m = [sin(2*pi*x+4*pi*y) for x in -2:1/16:2-1/16, y in 0:1/16:1-1/16]
  // Only batch entry 0 is filled; the rest stay zero (timing is unaffected).
  for (int j = 0; j < Ny; j++) {
    for (int i = 0; i < Nx; i++) {
      const double x = -2.0 + i / 16.0;
      const double y = j / 16.0;
      h_A(i, j, 0) = sin(2 * PI * x + 4 * PI * y);
    }
  }

  gt::copy(h_A, d_A);

  gt::fft::FFTPlanMany<gt::fft::Domain::REAL, E> plan({Nx, Ny}, batch_size);
  std::size_t work_bytes = plan.get_work_buffer_bytes();
  std::cout << "plan work bytes: " << work_bytes << std::endl;

  // One timed unit of work: forward transform of the whole batch plus a
  // device sync, so the measurement covers the full execution.
  auto run_forward = [&]() {
    plan(d_A, d_B);
    gt::synchronize();
  };

  // warm up, device compile
  run_forward();

  for (auto _ : state) {
    run_forward();
  }
}

// Register r2c benchmarks: float/double, 16x16 and 32x16 grids, batch
// multipliers 200 and 500, in the default device memory space.
BENCHMARK(BM_fft_r2c_2d<float, 16, 16, 200>)->Unit(benchmark::kMillisecond);
BENCHMARK(BM_fft_r2c_2d<double, 16, 16, 200>)->Unit(benchmark::kMillisecond);
BENCHMARK(BM_fft_r2c_2d<float, 32, 16, 200>)->Unit(benchmark::kMillisecond);
BENCHMARK(BM_fft_r2c_2d<double, 32, 16, 200>)->Unit(benchmark::kMillisecond);
BENCHMARK(BM_fft_r2c_2d<float, 16, 16, 500>)->Unit(benchmark::kMillisecond);
BENCHMARK(BM_fft_r2c_2d<double, 16, 16, 500>)->Unit(benchmark::kMillisecond);
BENCHMARK(BM_fft_r2c_2d<float, 32, 16, 500>)->Unit(benchmark::kMillisecond);
BENCHMARK(BM_fft_r2c_2d<double, 32, 16, 500>)->Unit(benchmark::kMillisecond);

// Same matrix of cases, but using managed (unified) memory.
BENCHMARK(BM_fft_r2c_2d<float, 16, 16, 200, gt::space::managed>)
->Unit(benchmark::kMillisecond);
BENCHMARK(BM_fft_r2c_2d<double, 16, 16, 200, gt::space::managed>)
->Unit(benchmark::kMillisecond);
BENCHMARK(BM_fft_r2c_2d<float, 32, 16, 200, gt::space::managed>)
->Unit(benchmark::kMillisecond);
BENCHMARK(BM_fft_r2c_2d<double, 32, 16, 200, gt::space::managed>)
->Unit(benchmark::kMillisecond);
BENCHMARK(BM_fft_r2c_2d<float, 16, 16, 500, gt::space::managed>)
->Unit(benchmark::kMillisecond);
BENCHMARK(BM_fft_r2c_2d<double, 16, 16, 500, gt::space::managed>)
->Unit(benchmark::kMillisecond);
BENCHMARK(BM_fft_r2c_2d<float, 32, 16, 500, gt::space::managed>)
->Unit(benchmark::kMillisecond);
BENCHMARK(BM_fft_r2c_2d<double, 32, 16, 500, gt::space::managed>)
->Unit(benchmark::kMillisecond);

// Uncomment to benchmark bigger case on large HPC class GPUs
// BENCHMARK(BM_fft_r2c_2d<double, 256, 45, 2, gt::space::managed>)
// ->Unit(benchmark::kMillisecond);

// ======================================================================
// BM_fft_c2r_2d
//
// Benchmark a batched 2d complex-to-real (inverse) FFT of size Nx x Ny.
// The forward transform is run once outside the timing loop to produce
// valid frequency-domain input for the timed inverse transforms.
//
// Fix: removed h_A2/d_A2/h_B_expected, which were allocated (including a
// large device buffer) but never used by the benchmark.

template <typename E, int Nx, int Ny, int batch_k,
          typename S = gt::space::device>
static void BM_fft_c2r_2d(benchmark::State& state)
{
  const int batch_size = 1024 * batch_k;
  using T = gt::complex<E>;

  auto h_A = gt::zeros<E>({Nx, Ny, batch_size});
  gt::bm::gtensor2<E, 3, S> d_A(h_A.shape());

  // r2c output keeps only Nx/2+1 rows (Hermitian symmetry of real input)
  auto h_B = gt::empty<T>({Nx / 2 + 1, Ny, batch_size});
  gt::bm::gtensor2<T, 3, S> d_B(h_B.shape());

  // Set up periodic domain with frequencies 4 and 2
  // m = [sin(2*pi*x+4*pi*y) for x in -2:1/16:2-1/16, y in 0:1/16:1-1/16]
  // Only batch entry 0 is filled; the rest stay zero (timing is unaffected).
  for (int j = 0; j < Ny; j++) {
    for (int i = 0; i < Nx; i++) {
      const double x = -2.0 + i / 16.0;
      const double y = j / 16.0;
      h_A(i, j, 0) = sin(2 * PI * x + 4 * PI * y);
    }
  }

  gt::copy(h_A, d_A);

  gt::fft::FFTPlanMany<gt::fft::Domain::REAL, E> plan({Nx, Ny}, batch_size);
  std::size_t work_bytes = plan.get_work_buffer_bytes();
  std::cout << "plan work bytes: " << work_bytes << std::endl;

  // Forward transform once so d_B holds real transform output to invert.
  plan(d_A, d_B);

  auto fn = [&]() {
    plan.inverse(d_B, d_A);
    gt::synchronize();
  };

  // warm up, device compile
  fn();

  for (auto _ : state) {
    fn();
  }
}

// Register c2r benchmarks: float/double, 16x16 and 32x16 grids, batch
// multipliers 200 and 500, in the default device memory space.
BENCHMARK(BM_fft_c2r_2d<float, 16, 16, 200>)->Unit(benchmark::kMillisecond);
BENCHMARK(BM_fft_c2r_2d<double, 16, 16, 200>)->Unit(benchmark::kMillisecond);
BENCHMARK(BM_fft_c2r_2d<float, 32, 16, 200>)->Unit(benchmark::kMillisecond);
BENCHMARK(BM_fft_c2r_2d<double, 32, 16, 200>)->Unit(benchmark::kMillisecond);
BENCHMARK(BM_fft_c2r_2d<float, 16, 16, 500>)->Unit(benchmark::kMillisecond);
BENCHMARK(BM_fft_c2r_2d<double, 16, 16, 500>)->Unit(benchmark::kMillisecond);
BENCHMARK(BM_fft_c2r_2d<float, 32, 16, 500>)->Unit(benchmark::kMillisecond);
BENCHMARK(BM_fft_c2r_2d<double, 32, 16, 500>)->Unit(benchmark::kMillisecond);

// Same matrix of cases, but using managed (unified) memory.
BENCHMARK(BM_fft_c2r_2d<float, 16, 16, 200, gt::space::managed>)
->Unit(benchmark::kMillisecond);
BENCHMARK(BM_fft_c2r_2d<double, 16, 16, 200, gt::space::managed>)
->Unit(benchmark::kMillisecond);
BENCHMARK(BM_fft_c2r_2d<float, 32, 16, 200, gt::space::managed>)
->Unit(benchmark::kMillisecond);
BENCHMARK(BM_fft_c2r_2d<double, 32, 16, 200, gt::space::managed>)
->Unit(benchmark::kMillisecond);
BENCHMARK(BM_fft_c2r_2d<float, 16, 16, 500, gt::space::managed>)
->Unit(benchmark::kMillisecond);
BENCHMARK(BM_fft_c2r_2d<double, 16, 16, 500, gt::space::managed>)
->Unit(benchmark::kMillisecond);
BENCHMARK(BM_fft_c2r_2d<float, 32, 16, 500, gt::space::managed>)
->Unit(benchmark::kMillisecond);
BENCHMARK(BM_fft_c2r_2d<double, 32, 16, 500, gt::space::managed>)
->Unit(benchmark::kMillisecond);

// google-benchmark supplies main(): runs all registered benchmarks.
BENCHMARK_MAIN();
6 changes: 5 additions & 1 deletion include/gt-fft/backend/sycl.h
Original file line number Diff line number Diff line change
Expand Up @@ -270,7 +270,11 @@ class FFTPlanManySYCL
work_buffer_bytes_ += plan_work_buffer_bytes;
}
} catch (std::exception const& e) {
std::cerr << "Error creating dft descriptor:" << e.what() << std::endl;
std::cerr << "Error creating dft descriptor size {" << lengths[0];
for (int i = 1; i < lengths.size(); i++) {
std::cerr << ", " << lengths[i];
}
std::cerr << "} batch " << batch_size << ": " << e.what() << std::endl;
abort();
}
}
Expand Down
19 changes: 16 additions & 3 deletions include/gtensor/backend_sycl.h
Original file line number Diff line number Diff line change
Expand Up @@ -268,7 +268,8 @@ class backend_ops<gt::space::sycl>
case ::sycl::usm::alloc::shared: return memory_type::managed;
case ::sycl::usm::alloc::unknown: return memory_type::unregistered;
default:
fprintf(stderr, "ERROR: unknown memoryType %d.\n", alloc_type);
std::cerr << "ERROR: unknown memory type for pointer " << ptr
<< std::endl;
std::abort();
}
}
Expand Down Expand Up @@ -304,12 +305,24 @@ class backend_ops<gt::space::sycl>
using base_class::base_class;

// Default: view the gtensor global default queue.
stream_view() : base_class(gt::backend::sycl::get_queue()) {}
// HACK: allow nullptr to work as default stream like other backends
// (CUDA/HIP treat a null stream handle as the default stream)
stream_view(::sycl::queue* p) : base_class(gt::backend::sycl::get_queue())
{
// Interop with e.g. Fortran code that initializes a queue not using
// gtensor interfaces
if (p != nullptr) {
// Note: SYCL uses reference semantics for queues, so a "copy" of the
// queue is really the same queue
this->stream_ = *p;
}
}
// Copying a view aliases the same underlying queue (reference semantics).
stream_view(const stream_view& other) : base_class(other.stream_) {}

// Rebind this view to other's queue; queue assignment just retargets a
// reference-counted handle, no device work is performed.
stream_view& operator=(const stream_view& other)
{
// Note: SYCL uses reference semantics for queues, so a "copy" of the
// queue is really the same queue
this->stream_ = other.stream_;
return *this;
}

Expand Down
8 changes: 8 additions & 0 deletions src/fortran/gpu_api.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,14 @@ extern "C" int gpuStreamCreate(sycl::queue** pStream)
return 0;
}

// Fortran-facing stream creation: allocate a fresh (non-default) SYCL
// queue and return its address through pStream. Always reports success (0),
// matching the CUDA/HIP-style error-code convention of this API.
extern "C" int gpuStreamCreateAsync(sycl::queue** pStream)
{
  // TODO: do we need to change queue properties for this
  // to be as close as possible to HIP/CUDA?
  auto& queue = gt::backend::sycl::new_stream_queue();
  *pStream = &queue;
  return 0;
}

extern "C" int gpuStreamDestroy(sycl::queue* stream)
{
if (stream != nullptr) {
Expand Down