From 5849e43aeecb2faaedf55cb74835a28ede292972 Mon Sep 17 00:00:00 2001
From: Bryce Allen <bdallen@uchicago.edu>
Date: Fri, 8 Dec 2023 14:04:58 +0000
Subject: [PATCH 1/8] bench: add 2d fft benchmarks

---
 benchmarks/bench_fft.cxx | 150 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 150 insertions(+)
diff --git a/benchmarks/bench_fft.cxx b/benchmarks/bench_fft.cxx
index b645cd92..98eeaecb 100644
--- a/benchmarks/bench_fft.cxx
+++ b/benchmarks/bench_fft.cxx
@@ -164,4 +164,154 @@ BENCHMARK(BM_fft_c2r_1d<float, 64, 500, gt::space::managed>)
 BENCHMARK(BM_fft_c2r_1d<double, 64, 500, gt::space::managed>)
   ->Unit(benchmark::kMillisecond);
 
+// ======================================================================
+// BM_fft_r2c_2d
+//
+
+template <typename E, int Nx, int Ny, int batch_k, typename S = gt::space::device>
+static void BM_fft_r2c_2d(benchmark::State& state)
+{
+  int batch_size = 1024 * batch_k;
+  using T = gt::complex<E>;
+
+  auto h_A = gt::zeros<E>({Nx, Ny, batch_size});
+  gt::bm::gtensor2<E, 3, S> d_A(h_A.shape());
+
+  auto h_A2 = gt::zeros<E>(h_A.shape());
+  gt::bm::gtensor2<E, 3, S> d_A2(h_A.shape());
+
+  auto h_B = gt::empty<T>({Nx / 2 + 1, Ny, batch_size});
+  auto h_B_expected = gt::empty<T>(h_B.shape());
+  gt::bm::gtensor2<T, 3, S> d_B(h_B.shape());
+
+  // Set up periodic domain with frequencies 4 and 2
+  // m = [sin(2*pi*x+4*pi*y) for x in -2:1/16:2-1/16, y in 0:1/16:1-1/16]
+  double x, y;
+  for (int j = 0; j < Ny; j++) {
+    for (int i = 0; i < Nx; i++) {
+      x = -2.0 + i / 16.0;
+      y = j / 16.0;
+      h_A(i, j, 0) = sin(2 * PI * x + 4 * PI * y);
+    }
+  }
+
+  gt::copy(h_A, d_A);
+
+  gt::fft::FFTPlanMany<gt::fft::Domain::REAL, E> plan({Nx, Ny}, batch_size);
+  std::size_t work_bytes = plan.get_work_buffer_bytes();
+  std::cout << "plan work bytes: " << work_bytes << std::endl;
+
+  auto fn = [&]() {
+    plan(d_A, d_B);
+    gt::synchronize();
+  };
+
+  // warm up, device compile
+  fn();
+
+  for (auto _ : state) {
+    fn();
+  }
+}
+
+BENCHMARK(BM_fft_r2c_2d<float, 16, 16, 200>)->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_fft_r2c_2d<double, 16, 16, 200>)->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_fft_r2c_2d<float, 32, 16, 200>)->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_fft_r2c_2d<double, 32, 16, 200>)->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_fft_r2c_2d<float, 16, 16, 500>)->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_fft_r2c_2d<double, 16, 16, 500>)->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_fft_r2c_2d<float, 32, 16, 500>)->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_fft_r2c_2d<double, 32, 16, 500>)->Unit(benchmark::kMillisecond);
+
+BENCHMARK(BM_fft_r2c_2d<float, 16, 16, 200, gt::space::managed>)
+  ->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_fft_r2c_2d<double, 16, 16, 200, gt::space::managed>)
+  ->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_fft_r2c_2d<float, 32, 16, 200, gt::space::managed>)
+  ->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_fft_r2c_2d<double, 32, 16, 200, gt::space::managed>)
+  ->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_fft_r2c_2d<float, 16, 16, 500, gt::space::managed>)
+  ->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_fft_r2c_2d<double, 16, 16, 500, gt::space::managed>)
+  ->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_fft_r2c_2d<float, 32, 16, 500, gt::space::managed>)
+  ->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_fft_r2c_2d<double, 32, 16, 500, gt::space::managed>)
+  ->Unit(benchmark::kMillisecond);
+
+template <typename E, int Nx, int Ny, int batch_k, typename S = gt::space::device>
+static void BM_fft_c2r_2d(benchmark::State& state)
+{
+  int batch_size = 1024 * batch_k;
+  using T = gt::complex<E>;
+
+  auto h_A = gt::zeros<E>({Nx, Ny, batch_size});
+  gt::bm::gtensor2<E, 3, S> d_A(h_A.shape());
+
+  auto h_A2 = gt::zeros<E>(h_A.shape());
+  gt::bm::gtensor2<E, 3, S> d_A2(h_A.shape());
+
+  auto h_B = gt::empty<T>({Nx / 2 + 1, Ny, batch_size});
+  auto h_B_expected = gt::empty<T>(h_B.shape());
+  gt::bm::gtensor2<T, 3, S> d_B(h_B.shape());
+
+  // Set up periodic domain with frequencies 4 and 2
+  // m = [sin(2*pi*x+4*pi*y) for x in -2:1/16:2-1/16, y in 0:1/16:1-1/16]
+  double x, y;
+  for (int j = 0; j < Ny; j++) {
+    for (int i = 0; i < Nx; i++) {
+      x = -2.0 + i / 16.0;
+      y = j / 16.0;
+      h_A(i, j, 0) = sin(2 * PI * x + 4 * PI * y);
+    }
+  }
+
+  gt::copy(h_A, d_A);
+
+  gt::fft::FFTPlanMany<gt::fft::Domain::REAL, E> plan({Nx, Ny}, batch_size);
+  std::size_t work_bytes = plan.get_work_buffer_bytes();
+  std::cout << "plan work bytes: " << work_bytes << std::endl;
+
+  plan(d_A, d_B);
+
+  auto fn = [&]() {
+    plan.inverse(d_B, d_A);
+    gt::synchronize();
+  };
+
+  // warm up, device compile
+  fn();
+
+  for (auto _ : state) {
+    fn();
+  }
+}
+
+BENCHMARK(BM_fft_c2r_2d<float, 16, 16, 200>)->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_fft_c2r_2d<double, 16, 16, 200>)->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_fft_c2r_2d<float, 32, 16, 200>)->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_fft_c2r_2d<double, 32, 16, 200>)->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_fft_c2r_2d<float, 16, 16, 500>)->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_fft_c2r_2d<double, 16, 16, 500>)->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_fft_c2r_2d<float, 32, 16, 500>)->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_fft_c2r_2d<double, 32, 16, 500>)->Unit(benchmark::kMillisecond);
+
+BENCHMARK(BM_fft_c2r_2d<float, 16, 16, 200, gt::space::managed>)
+  ->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_fft_c2r_2d<double, 16, 16, 200, gt::space::managed>)
+  ->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_fft_c2r_2d<float, 32, 16, 200, gt::space::managed>)
+  ->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_fft_c2r_2d<double, 32, 16, 200, gt::space::managed>)
+  ->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_fft_c2r_2d<float, 16, 16, 500, gt::space::managed>)
+  ->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_fft_c2r_2d<double, 16, 16, 500, gt::space::managed>)
+  ->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_fft_c2r_2d<float, 32, 16, 500, gt::space::managed>)
+  ->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_fft_c2r_2d<double, 32, 16, 500, gt::space::managed>)
+  ->Unit(benchmark::kMillisecond);
+
 BENCHMARK_MAIN();

From adc49fb6a5e9906485de28f7274a375ffd1fd80a Mon Sep 17 00:00:00 2001
From: Bryce Allen <bdallen@uchicago.edu>
Date: Mon, 11 Dec 2023 17:08:55 +0000
Subject: [PATCH 2/8] benchmarks: fft bench cleanup

---
 benchmarks/bench_fft.cxx | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/benchmarks/bench_fft.cxx b/benchmarks/bench_fft.cxx
index 98eeaecb..cdad1022 100644
--- a/benchmarks/bench_fft.cxx
+++ b/benchmarks/bench_fft.cxx
@@ -177,11 +177,7 @@ static void BM_fft_r2c_2d(benchmark::State& state)
   auto h_A = gt::zeros<E>({Nx, Ny, batch_size});
   gt::bm::gtensor2<E, 3, S> d_A(h_A.shape());
 
-  auto h_A2 = gt::zeros<E>(h_A.shape());
-  gt::bm::gtensor2<E, 3, S> d_A2(h_A.shape());
-
   auto h_B = gt::empty<T>({Nx / 2 + 1, Ny, batch_size});
-  auto h_B_expected = gt::empty<T>(h_B.shape());
   gt::bm::gtensor2<T, 3, S> d_B(h_B.shape());
 
   // Set up periodic domain with frequencies 4 and 2
@@ -240,6 +236,9 @@ BENCHMARK(BM_fft_r2c_2d<float, 32, 16, 500, gt::space::managed>)
 BENCHMARK(BM_fft_r2c_2d<double, 32, 16, 500, gt::space::managed>)
   ->Unit(benchmark::kMillisecond);
 
+BENCHMARK(BM_fft_r2c_2d<double, 256, 45, 2, gt::space::managed>)
+  ->Unit(benchmark::kMillisecond);
+
 template <typename E, int Nx, int Ny, int batch_k, typename S = gt::space::device>
 static void BM_fft_c2r_2d(benchmark::State& state)
 {

From a96d9e29b3b140799fef2e75000f0ae408958e74 Mon Sep 17 00:00:00 2001
From: Bryce Allen <bdallen@uchicago.edu>
Date: Mon, 11 Dec 2023 17:09:19 +0000
Subject: [PATCH 3/8] fft: better sycl error message on descriptor fail

---
 include/gt-fft/backend/sycl.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/include/gt-fft/backend/sycl.h b/include/gt-fft/backend/sycl.h
index ece251cc..138e9f50 100644
--- a/include/gt-fft/backend/sycl.h
+++ b/include/gt-fft/backend/sycl.h
@@ -270,7 +270,11 @@ class FFTPlanManySYCL
         work_buffer_bytes_ += plan_work_buffer_bytes;
       }
     } catch (std::exception const& e) {
-      std::cerr << "Error creating dft descriptor:" << e.what() << std::endl;
+      std::cerr << "Error creating dft descriptor size {" << lengths[0];
+      for (int i = 1; i < lengths.size(); i++) {
+        std::cerr << ", " << lengths[i];
+      }
+      std::cerr << "} batch " << batch_size << ": " << e.what() << std::endl;
       abort();
     }
   }

From 64cf498f672fd35726a0eacda9c6af37756c1577 Mon Sep 17 00:00:00 2001
From: Bryce Allen <bdallen@uchicago.edu>
Date: Tue, 12 Dec 2023 12:11:11 -0500
Subject: [PATCH 4/8] clang format

---
 benchmarks/bench_fft.cxx | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/benchmarks/bench_fft.cxx b/benchmarks/bench_fft.cxx
index cdad1022..c08c77ef 100644
--- a/benchmarks/bench_fft.cxx
+++ b/benchmarks/bench_fft.cxx
@@ -168,7 +168,8 @@ BENCHMARK(BM_fft_c2r_1d<double, 64, 500, gt::space::managed>)
 // BM_fft_r2c_2d
 //
 
-template <typename E, int Nx, int Ny, int batch_k, typename S = gt::space::device>
+template <typename E, int Nx, int Ny, int batch_k,
+          typename S = gt::space::device>
 static void BM_fft_r2c_2d(benchmark::State& state)
 {
   int batch_size = 1024 * batch_k;
@@ -239,7 +240,8 @@ BENCHMARK(BM_fft_r2c_2d<double, 32, 16, 500, gt::space::managed>)
 BENCHMARK(BM_fft_r2c_2d<double, 256, 45, 2, gt::space::managed>)
   ->Unit(benchmark::kMillisecond);
 
-template <typename E, int Nx, int Ny, int batch_k, typename S = gt::space::device>
+template <typename E, int Nx, int Ny, int batch_k,
+          typename S = gt::space::device>
 static void BM_fft_c2r_2d(benchmark::State& state)
 {
   int batch_size = 1024 * batch_k;

From 5cd18954f0c3a8c97e26d8cfaddaa2cff86b6db3 Mon Sep 17 00:00:00 2001
From: Bryce Allen <bdallen@uchicago.edu>
Date: Tue, 12 Dec 2023 12:47:23 -0500
Subject: [PATCH 5/8] sycl: stream_view ctor for null and pointers

Attempt to fix Fortran interop issues when queue is constructed and
managed without using gtensor API.
---
 include/gtensor/backend_sycl.h | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/include/gtensor/backend_sycl.h b/include/gtensor/backend_sycl.h
index f23a8827..c904ddac 100644
--- a/include/gtensor/backend_sycl.h
+++ b/include/gtensor/backend_sycl.h
@@ -304,12 +304,24 @@ class backend_ops<gt::space::sycl>
     using base_class::base_class;
 
     stream_view() : base_class(gt::backend::sycl::get_queue()) {}
+    // HACK: allow nullptr to work as default stream like other backends
+    stream_view(::sycl::queue* p) : base_class(gt::backend::sycl::get_queue())
+    {
+      // Interop with e.g. Fortran code that initializes a queue not using
+      // gtensor interfaces
+      if (p != nullptr) {
+        // Note: SYCL uses reference semantics for queues, so a "copy" of the
+        // queue is really the same queue
+        this->stream_ = *p;
+      }
+    }
     stream_view(const stream_view& other) : base_class(other.stream_) {}
 
     stream_view& operator=(const stream_view& other)
     {
-      this->~stream_view();
-      new (this) stream_view(other);
+      // Note: SYCL uses reference semantics for queues, so a "copy" of the
+      // queue is really the same queue
+      this->stream_ = other.stream_;
       return *this;
     }
 

From 513c353f88008ba6219b3375aeeb2c07f7dfcb2e Mon Sep 17 00:00:00 2001
From: Bryce Allen <bdallen@uchicago.edu>
Date: Fri, 15 Dec 2023 10:28:15 -0500
Subject: [PATCH 6/8] fortran: add missing gpuStreamCreateAsync for sycl

Just a copy of gpuStreamCreate, to get this working in GENE
---
 src/fortran/gpu_api.cxx | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/fortran/gpu_api.cxx b/src/fortran/gpu_api.cxx
index 5b284250..26c63496 100644
--- a/src/fortran/gpu_api.cxx
+++ b/src/fortran/gpu_api.cxx
@@ -195,6 +195,14 @@ extern "C" int gpuStreamCreate(sycl::queue** pStream)
   return 0;
 }
 
+extern "C" int gpuStreamCreateAsync(sycl::queue** pStream)
+{
+  // TODO: do we need to change queue properties for this
+  // to be as close as possible to HIP/CUDA?
+  *pStream = &gt::backend::sycl::new_stream_queue();
+  return 0;
+}
+
 extern "C" int gpuStreamDestroy(sycl::queue* stream)
 {
   if (stream != nullptr) {

From 87d2a6a31b5934861230ed2afa2491941543afa2 Mon Sep 17 00:00:00 2001
From: Bryce Allen <bdallen@uchicago.edu>
Date: Fri, 15 Dec 2023 12:39:52 -0500
Subject: [PATCH 7/8] sycl: fix memory type err handler

fixes compilation on some oneapi versions
---
 include/gtensor/backend_sycl.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/gtensor/backend_sycl.h b/include/gtensor/backend_sycl.h
index c904ddac..17de8524 100644
--- a/include/gtensor/backend_sycl.h
+++ b/include/gtensor/backend_sycl.h
@@ -268,7 +268,8 @@ class backend_ops<gt::space::sycl>
       case ::sycl::usm::alloc::shared: return memory_type::managed;
       case ::sycl::usm::alloc::unknown: return memory_type::unregistered;
       default:
-        fprintf(stderr, "ERROR: unknown memoryType %d.\n", alloc_type);
+        std::cerr << "ERROR: unknown memory type for pointer " << ptr
+                  << std::endl;
         std::abort();
     }
   }

From 7619e2bbd1e88da13b5a5dc5435191da53d09721 Mon Sep 17 00:00:00 2001
From: Bryce Allen <bdallen@uchicago.edu>
Date: Wed, 20 Dec 2023 11:24:36 -0500
Subject: [PATCH 8/8] benchmarks: comment out large fft benchmark

---
 benchmarks/bench_fft.cxx | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/benchmarks/bench_fft.cxx b/benchmarks/bench_fft.cxx
index c08c77ef..23ea9514 100644
--- a/benchmarks/bench_fft.cxx
+++ b/benchmarks/bench_fft.cxx
@@ -237,8 +237,9 @@ BENCHMARK(BM_fft_r2c_2d<float, 32, 16, 500, gt::space::managed>)
 BENCHMARK(BM_fft_r2c_2d<double, 32, 16, 500, gt::space::managed>)
   ->Unit(benchmark::kMillisecond);
 
-BENCHMARK(BM_fft_r2c_2d<double, 256, 45, 2, gt::space::managed>)
-  ->Unit(benchmark::kMillisecond);
+// Uncomment to benchmark bigger case on large HPC class GPUs
+// BENCHMARK(BM_fft_r2c_2d<double, 256, 45, 2, gt::space::managed>)
+// ->Unit(benchmark::kMillisecond);
 
 template <typename E, int Nx, int Ny, int batch_k,
           typename S = gt::space::device>