From a9e4a6f875e7fe282c4e21af97a72ad97500015d Mon Sep 17 00:00:00 2001
From: Bryce Allen <bdallen@uchicago.edu>
Date: Tue, 14 Mar 2023 10:05:40 -0400
Subject: [PATCH] fortran: add gpuAllocatorClearCache

- hack: clears the cache only for a hard-coded list of common allocator types
- no-op if the thrust backend is enabled (or no device backend is defined)
---
 include/gtensor/allocator.h       |  2 ++
 include/gtensor/backend_common.h  |  7 +++--
 include/gtensor/gtensor.h         | 49 +++++++++++++++++++++++++++++++
 src/fortran/gpu_api.cxx           |  2 ++
 src/fortran/gpu_api_interface.F90 |  3 ++
 5 files changed, 61 insertions(+), 2 deletions(-)
diff --git a/include/gtensor/allocator.h b/include/gtensor/allocator.h
index fe989a05..a02b8b8d 100644
--- a/include/gtensor/allocator.h
+++ b/include/gtensor/allocator.h
@@ -99,6 +99,8 @@ struct caching_allocator : A
 
   GT_INLINE void construct(pointer) {}
 
+  // Note: this static function does not work when thrust is enabled, because
+  // thrust allocators have a non-static deallocate
   static void clear_cache()
   {
     for (auto it = free_.begin(); it != free_.end(); it++) {
diff --git a/include/gtensor/backend_common.h b/include/gtensor/backend_common.h
index 85d39dc0..c44b82ff 100644
--- a/include/gtensor/backend_common.h
+++ b/include/gtensor/backend_common.h
@@ -267,8 +267,11 @@ struct wrap_allocator
   using pointer = gt::space_pointer<T, S>;
   using size_type = gt::size_type;
 
-  pointer allocate(size_type n) { return pointer(A::template allocate<T>(n)); }
-  void deallocate(pointer p, size_type n)
+  static pointer allocate(size_type n)
+  {
+    return pointer(A::template allocate<T>(n));
+  }
+  static void deallocate(pointer p, size_type n)
   {
     A::deallocate(gt::pointer_traits<pointer>::get(p));
   }
diff --git a/include/gtensor/gtensor.h b/include/gtensor/gtensor.h
index 58da5183..20c35306 100644
--- a/include/gtensor/gtensor.h
+++ b/include/gtensor/gtensor.h
@@ -942,6 +942,55 @@ decltype(auto) host_mirror(E& e)
   return detail::host_mirror<E>::run(e);
 }
 
+// ======================================================================
+// allocator_clear_caches
+
+template <typename T, typename S = gt::space::device>
+using caching_device_allocator =
+  gt::allocator::caching_allocator<T, device_allocator<T, S>>;
+
+template <typename T, typename S = gt::space::host>
+using caching_host_allocator =
+  gt::allocator::caching_allocator<T, host_allocator<T, S>>;
+
+template <typename T, typename S = gt::space::managed>
+using caching_managed_allocator =
+  gt::allocator::caching_allocator<T, managed_allocator<T, S>>;
+
+// Calls clear_cache() on the caching allocators for a fixed list of element
+// types in device, managed and host space; other types are not cleared.
+// TODO: this is a hack; caching_allocator should be rewritten to support
+// clearing after init/auto-parallelization, before the main time loop (GENE)
+inline void allocator_clear_caches()
+{
+  // No-op without a device backend or with thrust (non-static deallocate)
+#if defined(GTENSOR_HAVE_DEVICE) && !defined(GTENSOR_USE_THRUST)
+  gt::caching_device_allocator<double>::clear_cache();
+  gt::caching_device_allocator<float>::clear_cache();
+  gt::caching_device_allocator<gt::complex<double>>::clear_cache();
+  gt::caching_device_allocator<gt::complex<float>>::clear_cache();
+  gt::caching_device_allocator<uint8_t>::clear_cache();
+  gt::caching_device_allocator<int>::clear_cache();
+  gt::caching_device_allocator<std::size_t>::clear_cache();
+
+  gt::caching_managed_allocator<double>::clear_cache();
+  gt::caching_managed_allocator<float>::clear_cache();
+  gt::caching_managed_allocator<gt::complex<double>>::clear_cache();
+  gt::caching_managed_allocator<gt::complex<float>>::clear_cache();
+  gt::caching_managed_allocator<uint8_t>::clear_cache();
+  gt::caching_managed_allocator<int>::clear_cache();
+  gt::caching_managed_allocator<std::size_t>::clear_cache();
+
+  gt::caching_host_allocator<double>::clear_cache();
+  gt::caching_host_allocator<float>::clear_cache();
+  gt::caching_host_allocator<gt::complex<double>>::clear_cache();
+  gt::caching_host_allocator<gt::complex<float>>::clear_cache();
+  gt::caching_host_allocator<uint8_t>::clear_cache();
+  gt::caching_host_allocator<int>::clear_cache();
+  gt::caching_host_allocator<std::size_t>::clear_cache();
+#endif
+}
+
 } // namespace gt
 
 #endif
diff --git a/src/fortran/gpu_api.cxx b/src/fortran/gpu_api.cxx
index 5f9eddd0..f2bc74d4 100644
--- a/src/fortran/gpu_api.cxx
+++ b/src/fortran/gpu_api.cxx
@@ -196,3 +196,5 @@ extern "C" int gpuMemcpyAsync(void* dst, const void* src, size_t bytes,
 }
 
 #endif
+// C-linkage entry point for the Fortran API; see gt::allocator_clear_caches.
+extern "C" void gpuAllocatorClearCache() { gt::allocator_clear_caches(); }
diff --git a/src/fortran/gpu_api_interface.F90 b/src/fortran/gpu_api_interface.F90
index fa5141e0..a4780226 100644
--- a/src/fortran/gpu_api_interface.F90
+++ b/src/fortran/gpu_api_interface.F90
@@ -284,4 +284,16 @@ subroutine gpuDeviceGet(out_device_id)
       out_device_id = gt_backend_device_get()
    end subroutine gpuDeviceGet
 
+   ! Clear the gtensor caching allocator caches by calling the C entry point
+   ! gpuAllocatorClearCache. The interface body uses a distinct Fortran name
+   ! so that it does not clash with this module procedure.
+   subroutine gpuAllocatorClearCache()
+      interface
+         subroutine gt_allocator_clear_cache_c() &
+            bind(c, name="gpuAllocatorClearCache")
+         end subroutine gt_allocator_clear_cache_c
+      end interface
+      call gt_allocator_clear_cache_c()
+   end subroutine gpuAllocatorClearCache
+
 end module gpu_api_m