From 5849e43aeecb2faaedf55cb74835a28ede292972 Mon Sep 17 00:00:00 2001 From: Bryce Allen Date: Fri, 8 Dec 2023 14:04:58 +0000 Subject: [PATCH 1/8] bench: add 2d fft benchmarks --- benchmarks/bench_fft.cxx | 150 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 150 insertions(+) diff --git a/benchmarks/bench_fft.cxx b/benchmarks/bench_fft.cxx index b645cd92..98eeaecb 100644 --- a/benchmarks/bench_fft.cxx +++ b/benchmarks/bench_fft.cxx @@ -164,4 +164,154 @@ BENCHMARK(BM_fft_c2r_1d) BENCHMARK(BM_fft_c2r_1d) ->Unit(benchmark::kMillisecond); +// ====================================================================== +// BM_fft_r2c_2d +// + +template +static void BM_fft_r2c_2d(benchmark::State& state) +{ + int batch_size = 1024 * batch_k; + using T = gt::complex; + + auto h_A = gt::zeros({Nx, Ny, batch_size}); + gt::bm::gtensor2 d_A(h_A.shape()); + + auto h_A2 = gt::zeros(h_A.shape()); + gt::bm::gtensor2 d_A2(h_A.shape()); + + auto h_B = gt::empty({Nx / 2 + 1, Ny, batch_size}); + auto h_B_expected = gt::empty(h_B.shape()); + gt::bm::gtensor2 d_B(h_B.shape()); + + // Set up periodic domain with frequencies 4 and 2 + // m = [sin(2*pi*x+4*pi*y) for x in -2:1/16:2-1/16, y in 0:1/16:1-1/16] + double x, y; + for (int j = 0; j < Ny; j++) { + for (int i = 0; i < Nx; i++) { + x = -2.0 + i / 16.0; + y = j / 16.0; + h_A(i, j, 0) = sin(2 * PI * x + 4 * PI * y); + } + } + + gt::copy(h_A, d_A); + + gt::fft::FFTPlanMany plan({Nx, Ny}, batch_size); + std::size_t work_bytes = plan.get_work_buffer_bytes(); + std::cout << "plan work bytes: " << work_bytes << std::endl; + + auto fn = [&]() { + plan(d_A, d_B); + gt::synchronize(); + }; + + // warm up, device compile + fn(); + + for (auto _ : state) { + fn(); + } +} + +BENCHMARK(BM_fft_r2c_2d)->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_r2c_2d)->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_r2c_2d)->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_r2c_2d)->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_r2c_2d)->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_r2c_2d)->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_r2c_2d)->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_r2c_2d)->Unit(benchmark::kMillisecond); + +BENCHMARK(BM_fft_r2c_2d) + ->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_r2c_2d) + ->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_r2c_2d) + ->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_r2c_2d) + ->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_r2c_2d) + ->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_r2c_2d) + ->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_r2c_2d) + ->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_r2c_2d) + ->Unit(benchmark::kMillisecond); + +template +static void BM_fft_c2r_2d(benchmark::State& state) +{ + int batch_size = 1024 * batch_k; + using T = gt::complex; + + auto h_A = gt::zeros({Nx, Ny, batch_size}); + gt::bm::gtensor2 d_A(h_A.shape()); + + auto h_A2 = gt::zeros(h_A.shape()); + gt::bm::gtensor2 d_A2(h_A.shape()); + + auto h_B = gt::empty({Nx / 2 + 1, Ny, batch_size}); + auto h_B_expected = gt::empty(h_B.shape()); + gt::bm::gtensor2 d_B(h_B.shape()); + + // Set up periodic domain with frequencies 4 and 2 + // m = [sin(2*pi*x+4*pi*y) for x in -2:1/16:2-1/16, y in 0:1/16:1-1/16] + double x, y; + for (int j = 0; j < Ny; j++) { + for (int i = 0; i < Nx; i++) { + x = -2.0 + i / 16.0; + y = j / 16.0; + h_A(i, j, 0) = sin(2 * PI * x + 4 * PI * y); + } + } + + gt::copy(h_A, d_A); + + gt::fft::FFTPlanMany plan({Nx, Ny}, batch_size); + std::size_t work_bytes = plan.get_work_buffer_bytes(); + std::cout << "plan work bytes: " << work_bytes << std::endl; + + plan(d_A, d_B); + + auto fn = [&]() { + plan.inverse(d_B, d_A); + gt::synchronize(); + }; + + // warm up, device compile + fn(); + + for (auto _ : state) { + fn(); + } +} + +BENCHMARK(BM_fft_c2r_2d)->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_c2r_2d)->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_c2r_2d)->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_c2r_2d)->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_c2r_2d)->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_c2r_2d)->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_c2r_2d)->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_c2r_2d)->Unit(benchmark::kMillisecond); + +BENCHMARK(BM_fft_c2r_2d) + ->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_c2r_2d) + ->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_c2r_2d) + ->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_c2r_2d) + ->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_c2r_2d) + ->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_c2r_2d) + ->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_c2r_2d) + ->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_c2r_2d) + ->Unit(benchmark::kMillisecond); + BENCHMARK_MAIN(); From adc49fb6a5e9906485de28f7274a375ffd1fd80a Mon Sep 17 00:00:00 2001 From: Bryce Allen Date: Mon, 11 Dec 2023 17:08:55 +0000 Subject: [PATCH 2/8] benchmarks: fft bench cleanup --- benchmarks/bench_fft.cxx | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/benchmarks/bench_fft.cxx b/benchmarks/bench_fft.cxx index 98eeaecb..cdad1022 100644 --- a/benchmarks/bench_fft.cxx +++ b/benchmarks/bench_fft.cxx @@ -177,11 +177,7 @@ static void BM_fft_r2c_2d(benchmark::State& state) auto h_A = gt::zeros({Nx, Ny, batch_size}); gt::bm::gtensor2 d_A(h_A.shape()); - auto h_A2 = gt::zeros(h_A.shape()); - gt::bm::gtensor2 d_A2(h_A.shape()); - auto h_B = gt::empty({Nx / 2 + 1, Ny, batch_size}); - auto h_B_expected = gt::empty(h_B.shape()); gt::bm::gtensor2 d_B(h_B.shape()); // Set up periodic domain with frequencies 4 and 2 @@ -240,6 +236,9 @@ BENCHMARK(BM_fft_r2c_2d) BENCHMARK(BM_fft_r2c_2d) ->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_r2c_2d) + ->Unit(benchmark::kMillisecond); + template static void BM_fft_c2r_2d(benchmark::State& state) { From a96d9e29b3b140799fef2e75000f0ae408958e74 Mon Sep 17 00:00:00 2001 From: Bryce Allen Date: Mon, 11 Dec 2023 17:09:19 +0000 Subject: [PATCH 3/8] fft: better sycl error message on descriptor fail --- include/gt-fft/backend/sycl.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/include/gt-fft/backend/sycl.h b/include/gt-fft/backend/sycl.h index ece251cc..138e9f50 100644 --- a/include/gt-fft/backend/sycl.h +++ b/include/gt-fft/backend/sycl.h @@ -270,7 +270,11 @@ class FFTPlanManySYCL work_buffer_bytes_ += plan_work_buffer_bytes; } } catch (std::exception const& e) { - std::cerr << "Error creating dft descriptor:" << e.what() << std::endl; + std::cerr << "Error creating dft descriptor size {" << lengths[0]; + for (int i = 1; i < lengths.size(); i++) { + std::cerr << ", " << lengths[i]; + } + std::cerr << "} batch " << batch_size << ": " << e.what() << std::endl; abort(); } } From 64cf498f672fd35726a0eacda9c6af37756c1577 Mon Sep 17 00:00:00 2001 From: Bryce Allen Date: Tue, 12 Dec 2023 12:11:11 -0500 Subject: [PATCH 4/8] clang format --- benchmarks/bench_fft.cxx | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/benchmarks/bench_fft.cxx b/benchmarks/bench_fft.cxx index cdad1022..c08c77ef 100644 --- a/benchmarks/bench_fft.cxx +++ b/benchmarks/bench_fft.cxx @@ -168,7 +168,8 @@ BENCHMARK(BM_fft_c2r_1d) // BM_fft_r2c_2d // -template +template static void BM_fft_r2c_2d(benchmark::State& state) { int batch_size = 1024 * batch_k; @@ -239,7 +240,8 @@ BENCHMARK(BM_fft_r2c_2d) BENCHMARK(BM_fft_r2c_2d) ->Unit(benchmark::kMillisecond); -template +template static void BM_fft_c2r_2d(benchmark::State& state) { int batch_size = 1024 * batch_k; From 5cd18954f0c3a8c97e26d8cfaddaa2cff86b6db3 Mon Sep 17 00:00:00 2001 From: Bryce Allen Date: Tue, 12 Dec 2023 12:47:23 -0500 Subject: [PATCH 5/8] sycl: stream_view ctor for null and pointers Attempt to fix Fortran interop issues when queue is constructed and managed without using gtensor API. --- include/gtensor/backend_sycl.h | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/include/gtensor/backend_sycl.h b/include/gtensor/backend_sycl.h index f23a8827..c904ddac 100644 --- a/include/gtensor/backend_sycl.h +++ b/include/gtensor/backend_sycl.h @@ -304,12 +304,24 @@ class backend_ops using base_class::base_class; stream_view() : base_class(gt::backend::sycl::get_queue()) {} + // HACK: allow nullptr to work as default stream like other backends + stream_view(::sycl::queue* p) : base_class(gt::backend::sycl::get_queue()) + { + // Interop with e.g. Fortran code that initializes a queue not using + // gtensor interfaces + if (p != nullptr) { + // Note: SYCL uses reference semantics for queues, so a "copy" of the + // queue is really the same queue + this->stream_ = *p; + } + } stream_view(const stream_view& other) : base_class(other.stream_) {} stream_view& operator=(const stream_view& other) { - this->~stream_view(); - new (this) stream_view(other); + // Note: SYCL uses reference semantics for queues, so a "copy" of the + // queue is really the same queue + this->stream_ = other.stream_; return *this; } From 513c353f88008ba6219b3375aeeb2c07f7dfcb2e Mon Sep 17 00:00:00 2001 From: Bryce Allen Date: Fri, 15 Dec 2023 10:28:15 -0500 Subject: [PATCH 6/8] fortran: add missing gpuStreamCreateAsync for sycl Just a copy of gpuStreamCreate, to get this working in GENE --- src/fortran/gpu_api.cxx | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/fortran/gpu_api.cxx b/src/fortran/gpu_api.cxx index 5b284250..26c63496 100644 --- a/src/fortran/gpu_api.cxx +++ b/src/fortran/gpu_api.cxx @@ -195,6 +195,14 @@ extern "C" int gpuStreamCreate(sycl::queue** pStream) return 0; } +extern "C" int gpuStreamCreateAsync(sycl::queue** pStream) +{ + // TODO: do we need to change queue properties for this + // to be as close as possible to HIP/CUDA? + *pStream = >::backend::sycl::new_stream_queue(); + return 0; +} + extern "C" int gpuStreamDestroy(sycl::queue* stream) { if (stream != nullptr) { From 87d2a6a31b5934861230ed2afa2491941543afa2 Mon Sep 17 00:00:00 2001 From: Bryce Allen Date: Fri, 15 Dec 2023 12:39:52 -0500 Subject: [PATCH 7/8] sycl: fix memory type err handler fixes compilation on some oneapi versions --- include/gtensor/backend_sycl.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/gtensor/backend_sycl.h b/include/gtensor/backend_sycl.h index c904ddac..17de8524 100644 --- a/include/gtensor/backend_sycl.h +++ b/include/gtensor/backend_sycl.h @@ -268,7 +268,8 @@ class backend_ops case ::sycl::usm::alloc::shared: return memory_type::managed; case ::sycl::usm::alloc::unknown: return memory_type::unregistered; default: - fprintf(stderr, "ERROR: unknown memoryType %d.\n", alloc_type); + std::cerr << "ERROR: unknown memory type for pointer " << ptr + << std::endl; std::abort(); } } From 7619e2bbd1e88da13b5a5dc5435191da53d09721 Mon Sep 17 00:00:00 2001 From: Bryce Allen Date: Wed, 20 Dec 2023 11:24:36 -0500 Subject: [PATCH 8/8] benchmarks: comment out large fft benchmark --- benchmarks/bench_fft.cxx | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/benchmarks/bench_fft.cxx b/benchmarks/bench_fft.cxx index c08c77ef..23ea9514 100644 --- a/benchmarks/bench_fft.cxx +++ b/benchmarks/bench_fft.cxx @@ -237,8 +237,9 @@ BENCHMARK(BM_fft_r2c_2d) BENCHMARK(BM_fft_r2c_2d) ->Unit(benchmark::kMillisecond); -BENCHMARK(BM_fft_r2c_2d) - ->Unit(benchmark::kMillisecond); +// Uncomment to benchmark bigger case on large HPC class GPUs +// BENCHMARK(BM_fft_r2c_2d) +// ->Unit(benchmark::kMillisecond); template