diff --git a/benchmarks/bench_fft.cxx b/benchmarks/bench_fft.cxx index b645cd92..23ea9514 100644 --- a/benchmarks/bench_fft.cxx +++ b/benchmarks/bench_fft.cxx @@ -164,4 +164,156 @@ BENCHMARK(BM_fft_c2r_1d) BENCHMARK(BM_fft_c2r_1d) ->Unit(benchmark::kMillisecond); +// ====================================================================== +// BM_fft_r2c_2d +// + +template +static void BM_fft_r2c_2d(benchmark::State& state) +{ + int batch_size = 1024 * batch_k; + using T = gt::complex; + + auto h_A = gt::zeros({Nx, Ny, batch_size}); + gt::bm::gtensor2 d_A(h_A.shape()); + + auto h_B = gt::empty({Nx / 2 + 1, Ny, batch_size}); + gt::bm::gtensor2 d_B(h_B.shape()); + + // Set up periodic domain with frequencies 4 and 2 + // m = [sin(2*pi*x+4*pi*y) for x in -2:1/16:2-1/16, y in 0:1/16:1-1/16] + double x, y; + for (int j = 0; j < Ny; j++) { + for (int i = 0; i < Nx; i++) { + x = -2.0 + i / 16.0; + y = j / 16.0; + h_A(i, j, 0) = sin(2 * PI * x + 4 * PI * y); + } + } + + gt::copy(h_A, d_A); + + gt::fft::FFTPlanMany plan({Nx, Ny}, batch_size); + std::size_t work_bytes = plan.get_work_buffer_bytes(); + std::cout << "plan work bytes: " << work_bytes << std::endl; + + auto fn = [&]() { + plan(d_A, d_B); + gt::synchronize(); + }; + + // warm up, device compile + fn(); + + for (auto _ : state) { + fn(); + } +} + +BENCHMARK(BM_fft_r2c_2d)->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_r2c_2d)->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_r2c_2d)->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_r2c_2d)->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_r2c_2d)->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_r2c_2d)->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_r2c_2d)->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_r2c_2d)->Unit(benchmark::kMillisecond); + +BENCHMARK(BM_fft_r2c_2d) + ->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_r2c_2d) + ->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_r2c_2d) + ->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_r2c_2d) + 
->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_r2c_2d) + ->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_r2c_2d) + ->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_r2c_2d) + ->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_r2c_2d) + ->Unit(benchmark::kMillisecond); + +// Uncomment to benchmark bigger case on large HPC class GPUs +// BENCHMARK(BM_fft_r2c_2d) +// ->Unit(benchmark::kMillisecond); + +template +static void BM_fft_c2r_2d(benchmark::State& state) +{ + int batch_size = 1024 * batch_k; + using T = gt::complex; + + auto h_A = gt::zeros({Nx, Ny, batch_size}); + gt::bm::gtensor2 d_A(h_A.shape()); + + auto h_A2 = gt::zeros(h_A.shape()); + gt::bm::gtensor2 d_A2(h_A.shape()); + + auto h_B = gt::empty({Nx / 2 + 1, Ny, batch_size}); + auto h_B_expected = gt::empty(h_B.shape()); + gt::bm::gtensor2 d_B(h_B.shape()); + + // Set up periodic domain with frequencies 4 and 2 + // m = [sin(2*pi*x+4*pi*y) for x in -2:1/16:2-1/16, y in 0:1/16:1-1/16] + double x, y; + for (int j = 0; j < Ny; j++) { + for (int i = 0; i < Nx; i++) { + x = -2.0 + i / 16.0; + y = j / 16.0; + h_A(i, j, 0) = sin(2 * PI * x + 4 * PI * y); + } + } + + gt::copy(h_A, d_A); + + gt::fft::FFTPlanMany plan({Nx, Ny}, batch_size); + std::size_t work_bytes = plan.get_work_buffer_bytes(); + std::cout << "plan work bytes: " << work_bytes << std::endl; + + plan(d_A, d_B); + + auto fn = [&]() { + plan.inverse(d_B, d_A); + gt::synchronize(); + }; + + // warm up, device compile + fn(); + + for (auto _ : state) { + fn(); + } +} + +BENCHMARK(BM_fft_c2r_2d)->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_c2r_2d)->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_c2r_2d)->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_c2r_2d)->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_c2r_2d)->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_c2r_2d)->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_c2r_2d)->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_c2r_2d)->Unit(benchmark::kMillisecond); + 
+BENCHMARK(BM_fft_c2r_2d) + ->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_c2r_2d) + ->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_c2r_2d) + ->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_c2r_2d) + ->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_c2r_2d) + ->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_c2r_2d) + ->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_c2r_2d) + ->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_c2r_2d) + ->Unit(benchmark::kMillisecond); + BENCHMARK_MAIN(); diff --git a/include/gt-fft/backend/sycl.h b/include/gt-fft/backend/sycl.h index ece251cc..138e9f50 100644 --- a/include/gt-fft/backend/sycl.h +++ b/include/gt-fft/backend/sycl.h @@ -270,7 +270,11 @@ class FFTPlanManySYCL work_buffer_bytes_ += plan_work_buffer_bytes; } } catch (std::exception const& e) { - std::cerr << "Error creating dft descriptor:" << e.what() << std::endl; + std::cerr << "Error creating dft descriptor size {" << lengths[0]; + for (int i = 1; i < lengths.size(); i++) { + std::cerr << ", " << lengths[i]; + } + std::cerr << "} batch " << batch_size << ": " << e.what() << std::endl; abort(); } } diff --git a/include/gtensor/backend_sycl.h b/include/gtensor/backend_sycl.h index f23a8827..17de8524 100644 --- a/include/gtensor/backend_sycl.h +++ b/include/gtensor/backend_sycl.h @@ -268,7 +268,8 @@ class backend_ops case ::sycl::usm::alloc::shared: return memory_type::managed; case ::sycl::usm::alloc::unknown: return memory_type::unregistered; default: - fprintf(stderr, "ERROR: unknown memoryType %d.\n", alloc_type); + std::cerr << "ERROR: unknown memory type for pointer " << ptr + << std::endl; std::abort(); } } @@ -304,12 +305,24 @@ class backend_ops using base_class::base_class; stream_view() : base_class(gt::backend::sycl::get_queue()) {} + // HACK: allow nullptr to work as default stream like other backends + stream_view(::sycl::queue* p) : base_class(gt::backend::sycl::get_queue()) + { + // Interop with e.g. 
Fortran code that initializes a queue not using + // gtensor interfaces + if (p != nullptr) { + // Note: SYCL uses reference semantics for queues, so a "copy" of the + // queue is really the same queue + this->stream_ = *p; + } + } stream_view(const stream_view& other) : base_class(other.stream_) {} stream_view& operator=(const stream_view& other) { - this->~stream_view(); - new (this) stream_view(other); + // Note: SYCL uses reference semantics for queues, so a "copy" of the + // queue is really the same queue + this->stream_ = other.stream_; return *this; } diff --git a/src/fortran/gpu_api.cxx b/src/fortran/gpu_api.cxx index 5b284250..26c63496 100644 --- a/src/fortran/gpu_api.cxx +++ b/src/fortran/gpu_api.cxx @@ -195,6 +195,14 @@ extern "C" int gpuStreamCreate(sycl::queue** pStream) return 0; } +extern "C" int gpuStreamCreateAsync(sycl::queue** pStream) +{ + // TODO: do we need to change queue properties for this + // to be as close as possible to HIP/CUDA? + *pStream = &gt::backend::sycl::new_stream_queue(); + return 0; +} + extern "C" int gpuStreamDestroy(sycl::queue* stream) { if (stream != nullptr) {