diff --git a/benchmarks/bench_fft.cxx b/benchmarks/bench_fft.cxx index b645cd92..23ea9514 100644 --- a/benchmarks/bench_fft.cxx +++ b/benchmarks/bench_fft.cxx @@ -164,4 +164,156 @@ BENCHMARK(BM_fft_c2r_1d) BENCHMARK(BM_fft_c2r_1d) ->Unit(benchmark::kMillisecond); +// ====================================================================== +// BM_fft_r2c_2d +// + +template +static void BM_fft_r2c_2d(benchmark::State& state) +{ + int batch_size = 1024 * batch_k; + using T = gt::complex; + + auto h_A = gt::zeros({Nx, Ny, batch_size}); + gt::bm::gtensor2 d_A(h_A.shape()); + + auto h_B = gt::empty({Nx / 2 + 1, Ny, batch_size}); + gt::bm::gtensor2 d_B(h_B.shape()); + + // Set up periodic domain with frequencies 4 and 2 + // m = [sin(2*pi*x+4*pi*y) for x in -2:1/16:2-1/16, y in 0:1/16:1-1/16] + double x, y; + for (int j = 0; j < Ny; j++) { + for (int i = 0; i < Nx; i++) { + x = -2.0 + i / 16.0; + y = j / 16.0; + h_A(i, j, 0) = sin(2 * PI * x + 4 * PI * y); + } + } + + gt::copy(h_A, d_A); + + gt::fft::FFTPlanMany plan({Nx, Ny}, batch_size); + std::size_t work_bytes = plan.get_work_buffer_bytes(); + std::cout << "plan work bytes: " << work_bytes << std::endl; + + auto fn = [&]() { + plan(d_A, d_B); + gt::synchronize(); + }; + + // warm up, device compile + fn(); + + for (auto _ : state) { + fn(); + } +} + +BENCHMARK(BM_fft_r2c_2d)->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_r2c_2d)->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_r2c_2d)->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_r2c_2d)->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_r2c_2d)->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_r2c_2d)->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_r2c_2d)->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_r2c_2d)->Unit(benchmark::kMillisecond); + +BENCHMARK(BM_fft_r2c_2d) + ->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_r2c_2d) + ->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_r2c_2d) + ->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_r2c_2d) + 
->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_r2c_2d) + ->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_r2c_2d) + ->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_r2c_2d) + ->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_r2c_2d) + ->Unit(benchmark::kMillisecond); + +// Uncomment to benchmark bigger case on large HPC class GPUs +// BENCHMARK(BM_fft_r2c_2d) +// ->Unit(benchmark::kMillisecond); + +template +static void BM_fft_c2r_2d(benchmark::State& state) +{ + int batch_size = 1024 * batch_k; + using T = gt::complex; + + auto h_A = gt::zeros({Nx, Ny, batch_size}); + gt::bm::gtensor2 d_A(h_A.shape()); + + auto h_A2 = gt::zeros(h_A.shape()); + gt::bm::gtensor2 d_A2(h_A.shape()); + + auto h_B = gt::empty({Nx / 2 + 1, Ny, batch_size}); + auto h_B_expected = gt::empty(h_B.shape()); + gt::bm::gtensor2 d_B(h_B.shape()); + + // Set up periodic domain with frequencies 4 and 2 + // m = [sin(2*pi*x+4*pi*y) for x in -2:1/16:2-1/16, y in 0:1/16:1-1/16] + double x, y; + for (int j = 0; j < Ny; j++) { + for (int i = 0; i < Nx; i++) { + x = -2.0 + i / 16.0; + y = j / 16.0; + h_A(i, j, 0) = sin(2 * PI * x + 4 * PI * y); + } + } + + gt::copy(h_A, d_A); + + gt::fft::FFTPlanMany plan({Nx, Ny}, batch_size); + std::size_t work_bytes = plan.get_work_buffer_bytes(); + std::cout << "plan work bytes: " << work_bytes << std::endl; + + plan(d_A, d_B); + + auto fn = [&]() { + plan.inverse(d_B, d_A); + gt::synchronize(); + }; + + // warm up, device compile + fn(); + + for (auto _ : state) { + fn(); + } +} + +BENCHMARK(BM_fft_c2r_2d)->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_c2r_2d)->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_c2r_2d)->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_c2r_2d)->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_c2r_2d)->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_c2r_2d)->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_c2r_2d)->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_c2r_2d)->Unit(benchmark::kMillisecond); + 
+BENCHMARK(BM_fft_c2r_2d) + ->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_c2r_2d) + ->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_c2r_2d) + ->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_c2r_2d) + ->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_c2r_2d) + ->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_c2r_2d) + ->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_c2r_2d) + ->Unit(benchmark::kMillisecond); +BENCHMARK(BM_fft_c2r_2d) + ->Unit(benchmark::kMillisecond); + BENCHMARK_MAIN(); diff --git a/include/gt-fft/backend/sycl.h b/include/gt-fft/backend/sycl.h index ece251cc..138e9f50 100644 --- a/include/gt-fft/backend/sycl.h +++ b/include/gt-fft/backend/sycl.h @@ -270,7 +270,11 @@ class FFTPlanManySYCL work_buffer_bytes_ += plan_work_buffer_bytes; } } catch (std::exception const& e) { - std::cerr << "Error creating dft descriptor:" << e.what() << std::endl; + std::cerr << "Error creating dft descriptor size {" << lengths[0]; + for (int i = 1; i < lengths.size(); i++) { + std::cerr << ", " << lengths[i]; + } + std::cerr << "} batch " << batch_size << ": " << e.what() << std::endl; abort(); } } diff --git a/include/gtensor/backend_sycl.h b/include/gtensor/backend_sycl.h index f23a8827..17de8524 100644 --- a/include/gtensor/backend_sycl.h +++ b/include/gtensor/backend_sycl.h @@ -268,7 +268,8 @@ class backend_ops case ::sycl::usm::alloc::shared: return memory_type::managed; case ::sycl::usm::alloc::unknown: return memory_type::unregistered; default: - fprintf(stderr, "ERROR: unknown memoryType %d.\n", alloc_type); + std::cerr << "ERROR: unknown memory type for pointer " << ptr + << std::endl; std::abort(); } } @@ -304,12 +305,24 @@ class backend_ops using base_class::base_class; stream_view() : base_class(gt::backend::sycl::get_queue()) {} + // HACK: allow nullptr to work as default stream like other backends + stream_view(::sycl::queue* p) : base_class(gt::backend::sycl::get_queue()) + { + // Interop with e.g. 
Fortran code that initializes a queue not using + // gtensor interfaces + if (p != nullptr) { + // Note: SYCL uses reference semantics for queues, so a "copy" of the + // queue is really the same queue + this->stream_ = *p; + } + } stream_view(const stream_view& other) : base_class(other.stream_) {} stream_view& operator=(const stream_view& other) { - this->~stream_view(); - new (this) stream_view(other); + // Note: SYCL uses reference semantics for queues, so a "copy" of the + // queue is really the same queue + this->stream_ = other.stream_; return *this; } diff --git a/src/fortran/gpu_api.cxx b/src/fortran/gpu_api.cxx index 5b284250..26c63496 100644 --- a/src/fortran/gpu_api.cxx +++ b/src/fortran/gpu_api.cxx @@ -195,6 +195,14 @@ extern "C" int gpuStreamCreate(sycl::queue** pStream) return 0; } +extern "C" int gpuStreamCreateAsync(sycl::queue** pStream) +{ + // TODO: do we need to change queue properties for this + // to be as close as possible to HIP/CUDA? + *pStream = &gt::backend::sycl::new_stream_queue(); + return 0; +} + extern "C" int gpuStreamDestroy(sycl::queue* stream) { if (stream != nullptr) {