From 2e986b3497458b6531d948a80fde39cade86ea09 Mon Sep 17 00:00:00 2001
From: youge325 <A372707325@126.com>
Date: Sat, 28 Mar 2026 16:51:22 +0800
Subject: [PATCH 1/8] fix

---
 test/cpp/compat/ATen_TensorAccessor_test.cc | 2 +-
 test/cpp/compat/c10_storage_test.cc         | 9 +++++----
 test/cpp/compat/compat_basic_test.cc        | 2 +-
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/test/cpp/compat/ATen_TensorAccessor_test.cc b/test/cpp/compat/ATen_TensorAccessor_test.cc
index cb1eaaf3c8add0..4d6b8e9648bcfa 100644
--- a/test/cpp/compat/ATen_TensorAccessor_test.cc
+++ b/test/cpp/compat/ATen_TensorAccessor_test.cc
@@ -198,7 +198,7 @@ TEST(TensorAccessorTest, PackedAccessorWithIntType) {
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 TEST(TensorAccessorTest, PackedAccessorCUDA) {
-  if (at::cuda::is_available()) {
+  if (torch::cuda::is_available()) {
     // Create CUDA tensor
     at::Tensor tensor =
         at::arange(12, at::TensorOptions().dtype(at::kFloat).device(at::kCUDA))
diff --git a/test/cpp/compat/c10_storage_test.cc b/test/cpp/compat/c10_storage_test.cc
index f05083fe88747f..d94dee0d47b605 100644
--- a/test/cpp/compat/c10_storage_test.cc
+++ b/test/cpp/compat/c10_storage_test.cc
@@ -26,8 +26,9 @@
 #include "paddle/phi/backends/gpu/gpu_info.h"
 
 // Forward-declare getCUDADeviceAllocator to avoid include-order conflicts
-// between ATen/cuda/CUDAContextLight.h (defines at::cuda::is_available inline)
-// and torch/cuda.h (adds `using torch::cuda::is_available` to at::cuda).
+// between ATen/cuda/CUDAContextLight.h (defines torch::cuda::is_available
+// inline) and torch/cuda.h (adds `using torch::cuda::is_available` to
+// at::cuda).
 namespace at::cuda {
 c10::Allocator* getCUDADeviceAllocator();
 }  // namespace at::cuda
@@ -242,7 +243,7 @@ TEST(StorageTest, DeviceAndDeviceTypeAPIs) {
   ASSERT_EQ(place.GetType(), phi::AllocationType::CPU);
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  if (at::cuda::is_available()) {
+  if (torch::cuda::is_available()) {
     at::TensorBase cuda_tensor = at::ones(
         {2, 3}, c10::TensorOptions().dtype(at::kFloat).device(at::kCUDA));
     const c10::Storage& cuda_storage = cuda_tensor.storage();
@@ -1041,7 +1042,7 @@ TEST(StorageTest, ReferenceSemanticsSetNbytesVisibleThroughCopy) {
 TEST(StorageTest, CUDAAllocatorZeroBytePreservesDevice) {
   // getCUDADeviceAllocator()->allocate(0) must return a DataPtr whose device
   // is the current CUDA device, not a default-constructed CPU DataPtr.
-  if (!at::cuda::is_available()) {
+  if (!torch::cuda::is_available()) {
     return;  // No CUDA device, skip
   }
 
diff --git a/test/cpp/compat/compat_basic_test.cc b/test/cpp/compat/compat_basic_test.cc
index d6f33869449003..232a9fd66e8f7c 100644
--- a/test/cpp/compat/compat_basic_test.cc
+++ b/test/cpp/compat/compat_basic_test.cc
@@ -231,7 +231,7 @@ TEST(compat_basic_test, BasicCase) {
 TEST(TestDevice, DeviceAPIsOnCUDA) {
   // Test device related APIs on CUDA if available
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  if (at::cuda::is_available()) {
+  if (torch::cuda::is_available()) {
     at::TensorBase cuda_tensor = at::ones(
         {2, 3}, c10::TensorOptions().dtype(at::kFloat).device(at::kCUDA));
 

From 177c6dcd3cbce820e4001869c77e96ca2ea4b427 Mon Sep 17 00:00:00 2001
From: youge325 <A372707325@126.com>
Date: Sat, 28 Mar 2026 20:22:18 +0800
Subject: [PATCH 2/8] align some APIs

---
 .../phi/api/include/compat/ATen/core/ivalue.h | 47 +++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/paddle/phi/api/include/compat/ATen/core/ivalue.h b/paddle/phi/api/include/compat/ATen/core/ivalue.h
index d22e19c4cdc1c1..f1151e2dcf537d 100644
--- a/paddle/phi/api/include/compat/ATen/core/ivalue.h
+++ b/paddle/phi/api/include/compat/ATen/core/ivalue.h
@@ -224,6 +224,16 @@ class IValue {
   bool is_custom_class() const { return tag_ == TypeTag::CustomClass; }
   bool is_tuple() const { return tag_ == TypeTag::Tuple; }
 
+  bool isNone() const { return is_none(); }
+  bool isBool() const { return is_bool(); }
+  bool isInt() const { return is_int(); }
+  bool isDouble() const { return is_double(); }
+  bool isString() const { return is_string(); }
+  bool isList() const { return is_list(); }
+  bool isTensor() const { return is_tensor(); }
+  bool isCustomClass() const { return is_custom_class(); }
+  bool isTuple() const { return is_tuple(); }
+
   bool to_bool() const {
     if (!is_bool()) throw std::runtime_error("Not a bool");
     return std::get<bool>(value_);
@@ -280,6 +290,39 @@ class IValue {
     return static_cast<at::ScalarType>(std::get<int64_t>(value_));
   }
 
+  bool toBool() const { return to_bool(); }
+  int64_t toInt() const { return to_int(); }
+  double toDouble() const { return to_double(); }
+  const std::string& toStringRef() const { return to_string(); }
+  std::string_view toStringView() const { return to_string_view(); }
+  at::Tensor toTensor() const { return to_tensor(); }
+  at::ScalarType toScalarType() const { return to_scalar_type(); }
+
+  std::string tagKind() const {
+    switch (tag_) {
+      case TypeTag::None:
+        return "None";
+      case TypeTag::Bool:
+        return "Bool";
+      case TypeTag::Int:
+        return "Int";
+      case TypeTag::Double:
+        return "Double";
+      case TypeTag::String:
+        return "String";
+      case TypeTag::Tensor:
+        return "Tensor";
+      case TypeTag::GenericList:
+        return "GenericList";
+      case TypeTag::CustomClass:
+        return "CustomClass";
+      case TypeTag::Tuple:
+        return "Tuple";
+      default:
+        return "InvalidTag";
+    }
+  }
+
   template <typename T>
   intrusive_ptr<T> to_custom_class() const {
     if (!is_custom_class()) throw std::runtime_error("Not a custom class");
@@ -637,3 +680,7 @@ intrusive_ptr<T> generic_to(const IValue& ivalue,
 }
 
 }  // namespace torch
+
+namespace c10 {
+using IValue = ::torch::IValue;
+}

From 84aa0c03174abc836bdc25aa9bdb17f4aa3456c3 Mon Sep 17 00:00:00 2001
From: youge325 <A372707325@126.com>
Date: Sat, 28 Mar 2026 20:36:20 +0800
Subject: [PATCH 3/8] add align tests

---
 test/cpp/compat/c10_layout_test.cc | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/test/cpp/compat/c10_layout_test.cc b/test/cpp/compat/c10_layout_test.cc
index 6e163aa8ee4c5a..1a0babeb7d3673 100644
--- a/test/cpp/compat/c10_layout_test.cc
+++ b/test/cpp/compat/c10_layout_test.cc
@@ -368,6 +368,9 @@ TEST(SparseConstructorTest, SparseCooTensorInferSize) {
 
   ASSERT_TRUE(sparse.is_sparse());
   ASSERT_EQ(sparse.layout(), c10::kSparse);
+  ASSERT_EQ(sparse.dim(), 2);
+  ASSERT_EQ(sparse.size(0), 3);
+  ASSERT_EQ(sparse.size(1), 3);
 }
 
 TEST(SparseConstructorTest, SparseCooTensorDouble) {

From f3d4070b09964a4d5fc9ed192c5b51c879d84227 Mon Sep 17 00:00:00 2001
From: youge325 <A372707325@126.com>
Date: Sat, 28 Mar 2026 21:21:30 +0800
Subject: [PATCH 4/8] align some APIs

---
 .../api/include/compat/c10/cuda/CUDAGuard.h   | 104 ++++++++++-----
 .../api/include/compat/c10/cuda/CUDAStream.h  | 121 +++++++++++-------
 2 files changed, 146 insertions(+), 79 deletions(-)

diff --git a/paddle/phi/api/include/compat/c10/cuda/CUDAGuard.h b/paddle/phi/api/include/compat/c10/cuda/CUDAGuard.h
index ce819e69e64932..8f2193ef46ae0e 100644
--- a/paddle/phi/api/include/compat/c10/cuda/CUDAGuard.h
+++ b/paddle/phi/api/include/compat/c10/cuda/CUDAGuard.h
@@ -19,16 +19,43 @@
 #pragma once
 
 #include <c10/core/Device.h>
+#include <c10/util/Exception.h>
+
 #include <optional>
+
 #include "paddle/phi/core/platform/cuda_device_guard.h"
 
 namespace c10::cuda {
+
+namespace detail {
+
+inline Device current_cuda_device() {
+  return Device(kCUDA, phi::backends::gpu::GetCurrentDeviceId());
+}
+
+inline Device normalize_cuda_device(Device device) {
+  TORCH_CHECK(device.is_cuda(), "Expected a CUDA device, but got ", device);
+  return Device(kCUDA, device.has_index() ? device.index() : 0);
+}
+
+}  // namespace detail
+
 struct CUDAGuard {
   explicit CUDAGuard() = delete;  // NOLINT
 
-  explicit CUDAGuard(DeviceIndex device_index) : guard_(device_index) {}
+  explicit CUDAGuard(DeviceIndex device_index)
+      : original_device_(detail::current_cuda_device()),
+        current_device_(original_device_),
+        guard_() {
+    set_index(device_index);
+  }
 
-  explicit CUDAGuard(Device device) : guard_(device._PD_GetInner()) {}
+  explicit CUDAGuard(Device device)
+      : original_device_(detail::current_cuda_device()),
+        current_device_(original_device_),
+        guard_() {
+    set_device(device);
+  }
 
   CUDAGuard(const CUDAGuard&) = delete;
   CUDAGuard& operator=(const CUDAGuard&) = delete;
@@ -37,79 +64,86 @@ struct CUDAGuard {
   CUDAGuard& operator=(CUDAGuard&& other) = delete;
   ~CUDAGuard() = default;
 
-  void set_device(Device device) { guard_.SetDevice(device._PD_GetInner()); }
+  void set_device(Device device) {
+    current_device_ = detail::normalize_cuda_device(device);
+    guard_.SetDevice(current_device_._PD_GetInner());
+  }
 
   void reset_device(Device device) { set_device(device); }
 
   void set_index(DeviceIndex device_index) {
+    current_device_ = Device(kCUDA, device_index);
     guard_.SetDeviceIndex(device_index);
   }
 
-  Device current_device() const {
-    return c10::Device(c10::kCUDA, phi::backends::gpu::GetCurrentDeviceId());
-  }
+  Device original_device() const { return original_device_; }
+
+  Device current_device() const { return current_device_; }
 
  private:
+  Device original_device_;
+  Device current_device_;
   paddle::platform::CUDADeviceGuard guard_;
 };
 
 struct OptionalCUDAGuard {
   OptionalCUDAGuard() = default;
 
-  explicit OptionalCUDAGuard(std::optional<Device> device_opt) : guard_() {
+  explicit OptionalCUDAGuard(std::optional<Device> device_opt) {
     if (device_opt.has_value()) {
-      guard_.emplace(device_opt.value()._PD_GetInner());
+      set_device(device_opt.value());
     }
   }
 
-  explicit OptionalCUDAGuard(std::optional<DeviceIndex> device_index_opt)
-      : guard_() {
+  explicit OptionalCUDAGuard(std::optional<DeviceIndex> device_index_opt) {
     if (device_index_opt.has_value()) {
-      guard_.emplace(device_index_opt.value());
+      set_index(device_index_opt.value());
     }
   }
 
-  // Copy is not allowed
   OptionalCUDAGuard(const OptionalCUDAGuard&) = delete;
   OptionalCUDAGuard& operator=(const OptionalCUDAGuard&) = delete;
 
   OptionalCUDAGuard(OptionalCUDAGuard&& other) = delete;
-
   OptionalCUDAGuard& operator=(OptionalCUDAGuard&& other) = delete;
   ~OptionalCUDAGuard() = default;
 
   void set_device(Device device) {
-    if (!guard_.has_value()) {
-      guard_.emplace(device._PD_GetInner());
-    } else {
-      guard_->SetDevice(device._PD_GetInner());
-    }
+    const Device normalized = detail::normalize_cuda_device(device);
+    init_if_needed();
+    guard_->SetDevice(normalized._PD_GetInner());
+    current_device_ = normalized;
   }
 
-  void reset_device(Device device) {
-    if (!guard_.has_value()) {
-      guard_.emplace(device._PD_GetInner());
-    } else {
-      guard_->SetDevice(device._PD_GetInner());
-    }
-  }
+  void reset_device(Device device) { set_device(device); }
 
   void set_index(DeviceIndex device_index) {
-    if (!guard_.has_value()) {
-      guard_.emplace(device_index);
-    } else {
-      guard_->SetDeviceIndex(device_index);
-    }
+    init_if_needed();
+    guard_->SetDeviceIndex(device_index);
+    current_device_ = Device(kCUDA, device_index);
   }
 
-  std::optional<Device> current_device() const {
-    return guard_.has_value()
-               ? std::make_optional(c10::Device(
-                     c10::kCUDA, phi::backends::gpu::GetCurrentDeviceId()))
-               : std::nullopt;
+  std::optional<Device> original_device() const { return original_device_; }
+
+  std::optional<Device> current_device() const { return current_device_; }
+
+  void reset() {
+    guard_.reset();
+    original_device_.reset();
+    current_device_.reset();
   }
 
  private:
+  void init_if_needed() {
+    if (!guard_.has_value()) {
+      original_device_ = detail::current_cuda_device();
+      current_device_ = original_device_;
+      guard_.emplace();
+    }
+  }
+
+  std::optional<Device> original_device_;
+  std::optional<Device> current_device_;
   std::optional<paddle::platform::CUDADeviceGuard> guard_;
 };
 
diff --git a/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h b/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h
index f88dd043317252..f0f8f23ee1b711 100644
--- a/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h
+++ b/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h
@@ -18,9 +18,14 @@
 #include <c10/core/Stream.h>
 #include <c10/cuda/CUDAException.h>
 
+#include <algorithm>
 #include <array>
 #include <atomic>
+#include <functional>
 #include <mutex>
+#include <ostream>
+#include <tuple>
+
 #include "paddle/phi/api/include/context_pool.h"
 #include "paddle/phi/backends/gpu/gpu_info.h"
 #include "paddle/phi/common/place.h"
@@ -30,16 +35,13 @@ namespace c10::cuda {
 
 using StreamId = int64_t;
 
-// ── Per-device stream pool and per-thread current stream ─────────────────────
+static constexpr int max_compile_time_stream_priorities = 4;
 
 namespace detail {
 
 constexpr int kStreamsPerPool = 32;
-// Upper bound for static pool/TLS arrays. 64 covers all current CUDA hardware.
 constexpr int kMaxDevices = 64;
 
-// Device count is invariant after CUDA initialization; cache it to avoid
-// repeated driver calls on the hot path.
 inline int gpu_device_count() {
   static const int count = phi::backends::gpu::GetGPUDeviceCount();
   return count;
@@ -83,9 +85,6 @@ inline void init_pool(int device_index, StreamPoolState* state) {
   }
 }
 
-// Per-thread, per-device current stream state.
-// thread_local inside an inline function is ODR-safe across translation units
-// (C++11 §3.2): all TUs share the same thread-local instance per thread.
 struct TLSStreamState {
   cudaStream_t streams[kMaxDevices]{};
   bool has_stream[kMaxDevices]{};
@@ -98,22 +97,24 @@ inline TLSStreamState& get_tls() {
 
 }  // namespace detail
 
-// ── CUDAStream ───────────────────────────────────────────────────────────────
-
 class CUDAStream {
  public:
+  enum Unchecked { UNCHECKED };
+
   CUDAStream() = delete;
 
   explicit CUDAStream(Stream stream) : stream_(stream) {
     TORCH_CHECK(stream_.device_type() == DeviceType::CUDA);
   }
 
+  explicit CUDAStream(Unchecked /*unused*/, Stream stream) : stream_(stream) {}
+
   bool operator==(const CUDAStream& other) const noexcept {
-    return stream_ == other.stream_;
+    return unwrap() == other.unwrap();
   }
 
   bool operator!=(const CUDAStream& other) const noexcept {
-    return stream_ != other.stream_;
+    return unwrap() != other.unwrap();
   }
 
   StreamId id() const { return stream_.id(); }
@@ -122,6 +123,21 @@ class CUDAStream {
 
   operator Stream() const { return unwrap(); }
 
+  bool query() const { return unwrap().query(); }
+
+  void synchronize() const { unwrap().synchronize(); }
+
+  int priority() const {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+    phi::backends::gpu::GPUDeviceGuard guard(device_index());
+    int priority = 0;
+    C10_CUDA_CHECK(cudaStreamGetPriority(stream(), &priority));
+    return priority;
+#else
+    return 0;
+#endif
+  }
+
   cudaStream_t stream() const {
     return reinterpret_cast<cudaStream_t>(stream_.id());
   }
@@ -134,17 +150,34 @@ class CUDAStream {
 
   Device device() const { return Device(DeviceType::CUDA, device_index()); }
 
-  // TODO(youge325): Remove after DeepEP paddle branch is updated to use
-  // stream()
-  cudaStream_t raw_stream() const { return stream(); }
+  struct c10::StreamData3 pack3() const {
+    return stream_.pack3();
+  }
+
+  static CUDAStream unpack3(StreamId stream_id,
+                            DeviceIndex device_index,
+                            DeviceType device_type) {
+    return CUDAStream(Stream::unpack3(stream_id, device_index, device_type));
+  }
+
+  static std::tuple<int, int> priority_range() {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+    int least_priority = 0;
+    int greatest_priority = 0;
+    C10_CUDA_CHECK(
+        cudaDeviceGetStreamPriorityRange(&least_priority, &greatest_priority));
+    greatest_priority =
+        std::max(-max_compile_time_stream_priorities + 1, greatest_priority);
+    return std::make_tuple(least_priority, greatest_priority);
+#else
+    return std::make_tuple(0, 0);
+#endif
+  }
 
  private:
   Stream stream_;
 };
 
-// Build a CUDAStream from a raw platform stream handle and a device index.
-// The handle is encoded as a StreamId via reinterpret_cast, matching Paddle's
-// phi::Stream / phi::CUDAStream convention.
 inline CUDAStream make_cuda_stream(cudaStream_t raw,
                                    c10::DeviceIndex device_index) {
   c10::StreamId sid =
@@ -155,14 +188,6 @@ inline CUDAStream make_cuda_stream(cudaStream_t raw,
                   sid));
 }
 
-/**
- * Get the current CUDA stream for the given device (or the current device if
- * device_index == -1).
- *
- * Returns the per-thread current stream if one has been set via
- * setCurrentCUDAStream() for this thread and device; otherwise falls back to
- * Paddle's default stream for the device.
- */
 inline CUDAStream getCurrentCUDAStream(c10::DeviceIndex device_index = -1) {
   if (device_index == -1) {
     device_index = phi::backends::gpu::GetCurrentDeviceId();
@@ -181,29 +206,18 @@ inline CUDAStream getCurrentCUDAStream(c10::DeviceIndex device_index = -1) {
   return make_cuda_stream(raw, device_index);
 }
 
-/**
- * Get a stream from the per-device pool in round-robin fashion.
- * Returns a high priority stream if isHighPriority is true.
- *
- * The pool is lazily initialized on first use for each device.  Each device
- * has kStreamsPerPool low-priority and kStreamsPerPool high-priority streams
- * that are reused round-robin.  Pool streams are always distinct from the
- * current stream, enabling cross-stream dependency management and correct
- * record_stream lifetime semantics.
- */
-inline CUDAStream getStreamFromPool(const bool isHighPriority = false,
+inline CUDAStream getStreamFromPool(const int priority,
                                     c10::DeviceIndex device_index = -1) {
   if (device_index == -1) {
     device_index = phi::backends::gpu::GetCurrentDeviceId();
   }
-  // get_pool also performs bounds-checking on device_index.
   auto& state = detail::get_pool(device_index);
   std::call_once(state.init_flag, [device_index, &state]() {
     detail::init_pool(device_index, &state);
   });
 
   cudaStream_t raw;
-  if (isHighPriority) {
+  if (priority < 0) {
     raw = state.high_priority[state.hp_counter.fetch_add(1) %
                               detail::kStreamsPerPool];
   } else {
@@ -221,6 +235,17 @@ inline CUDAStream getStreamFromPool(const bool isHighPriority = false,
  * local to the calling OS thread and does not affect any shared state such as
  * Paddle's GPUContext.  Other threads continue to see their own current stream.
  */
+inline CUDAStream getStreamFromPool(const bool isHighPriority,
+                                    c10::DeviceIndex device_index) {
+  return getStreamFromPool(isHighPriority ? -1 : 0, device_index);
+}
+
+inline CUDAStream getStreamFromExternal(cudaStream_t ext_stream,
+                                        c10::DeviceIndex device_index) {
+  detail::check_device_index(device_index);
+  return make_cuda_stream(ext_stream, device_index);
+}
+
 inline void setCurrentCUDAStream(CUDAStream stream) {
   c10::DeviceIndex idx = stream.unwrap().device_index();
   detail::check_device_index(idx);
@@ -234,22 +259,30 @@ inline CUDAStream getDefaultCUDAStream(c10::DeviceIndex device_index = -1) {
     device_index = phi::backends::gpu::GetCurrentDeviceId();
   }
   detail::check_device_index(device_index);
-  // The default CUDA stream is always the null stream (cudaStreamDefault,
-  // handle == 0), regardless of any per-thread current stream override.
-  // This matches PyTorch semantics where getDefaultCUDAStream() returns the
-  // fixed device-level default stream, while getCurrentCUDAStream() returns
-  // the per-thread current stream (which may differ after
-  // setCurrentCUDAStream).
   return CUDAStream(c10::Stream(
       c10::Stream::DEFAULT, c10::Device(c10::DeviceType::CUDA, device_index)));
 }
 
+inline std::ostream& operator<<(std::ostream& stream, const CUDAStream& s) {
+  return stream << s.unwrap();
+}
+
 }  // namespace c10::cuda
 
+namespace std {
+template <>
+struct hash<c10::cuda::CUDAStream> {
+  size_t operator()(c10::cuda::CUDAStream s) const noexcept {
+    return std::hash<c10::Stream>{}(s.unwrap());
+  }
+};
+}  // namespace std
+
 namespace at::cuda {
 using c10::cuda::CUDAStream;
 using c10::cuda::getCurrentCUDAStream;
 using c10::cuda::getDefaultCUDAStream;
+using c10::cuda::getStreamFromExternal;
 using c10::cuda::getStreamFromPool;
 using c10::cuda::setCurrentCUDAStream;
 }  // namespace at::cuda

From 1d0432550c6508ddd97c50f56bf0144fa6095234 Mon Sep 17 00:00:00 2001
From: youge325 <A372707325@126.com>
Date: Sun, 29 Mar 2026 12:46:39 +0800
Subject: [PATCH 5/8] align Allocator

---
 .../api/include/compat/c10/core/Allocator.h   | 101 +++++++++++++++++-
 1 file changed, 98 insertions(+), 3 deletions(-)

diff --git a/paddle/phi/api/include/compat/c10/core/Allocator.h b/paddle/phi/api/include/compat/c10/core/Allocator.h
index a35070936031ad..14a9f2f7dd57d1 100644
--- a/paddle/phi/api/include/compat/c10/core/Allocator.h
+++ b/paddle/phi/api/include/compat/c10/core/Allocator.h
@@ -23,7 +23,9 @@
 #include <c10/util/Exception.h>
 #include <c10/util/UniqueVoidPtr.h>
 
+#include <array>
 #include <cstddef>
+#include <cstdint>
 #include <cstring>
 #include <functional>
 #include <memory>
@@ -37,6 +39,15 @@ namespace c10 {
 // Deleter function pointer type (compatible with LibTorch)
 using DeleterFnPtr = void (*)(void*);
 
+using CaptureId_t = uint64_t;
+using MempoolId_t = std::pair<CaptureId_t, CaptureId_t>;
+
+struct MempoolIdHash {
+  std::size_t operator()(const MempoolId_t& mempool_id) const noexcept {
+    return mempool_id.first != 0 ? mempool_id.first : mempool_id.second;
+  }
+};
+
 // DataPtr class compatible with LibTorch's c10::DataPtr
 // Wraps a pointer with associated device and deleter
 class DataPtr {
@@ -63,6 +74,7 @@ class DataPtr {
 
   void clear() { ptr_.clear(); }
   void* get() const { return ptr_.get(); }
+  void* mutable_get() { return ptr_.get(); }
   void* get_context() const { return ptr_.get_context(); }
   void* release_context() { return ptr_.release_context(); }
 
@@ -128,15 +140,14 @@ struct Allocator {
   // Requires: input data was allocated by the same allocator.
   DataPtr clone(const void* data, std::size_t n) {
     auto new_data = allocate(n);
-    copy_data(new_data.get(), data, n);
+    copy_data(new_data.mutable_get(), data, n);
     return new_data;
   }
 
   // Checks if DataPtr has a simple context, not wrapped with any out of the
   // ordinary contexts.
   virtual bool is_simple_data_ptr(const DataPtr& data_ptr) const {
-    return data_ptr.get_context() == nullptr ||
-           data_ptr.get_context() == data_ptr.get();
+    return data_ptr.get() == data_ptr.get_context();
   }
 
   // If this returns a non nullptr, it means that allocate()
@@ -176,6 +187,90 @@ struct Allocator {
   }
 };
 
+struct InefficientStdFunctionContext {
+  void* ptr_{nullptr};
+  std::function<void(void*)> deleter_;
+
+  InefficientStdFunctionContext(void* ptr, std::function<void(void*)> deleter)
+      : ptr_(ptr), deleter_(std::move(deleter)) {}
+
+  InefficientStdFunctionContext(const InefficientStdFunctionContext&) = delete;
+
+  InefficientStdFunctionContext(InefficientStdFunctionContext&& rhs) noexcept
+      : ptr_(std::exchange(rhs.ptr_, nullptr)),
+        deleter_(std::move(rhs.deleter_)) {}
+
+  InefficientStdFunctionContext& operator=(
+      const InefficientStdFunctionContext&) = delete;
+
+  InefficientStdFunctionContext& operator=(
+      InefficientStdFunctionContext&& rhs) {
+    this->~InefficientStdFunctionContext();
+    ptr_ = std::exchange(rhs.ptr_, nullptr);
+    deleter_ = std::move(rhs.deleter_);
+    return *this;
+  }
+
+  ~InefficientStdFunctionContext() {
+    if (deleter_) {
+      deleter_(ptr_);
+    }
+  }
+
+  static DataPtr makeDataPtr(void* ptr,
+                             std::function<void(void*)> deleter,
+                             Device device) {
+    return DataPtr(ptr,
+                   new InefficientStdFunctionContext(ptr, std::move(deleter)),
+                   &deleteContext,
+                   device);
+  }
+
+ private:
+  static void deleteContext(void* ptr) {
+    delete static_cast<InefficientStdFunctionContext*>(ptr);
+  }
+};
+
+inline constexpr size_t kAllocatorRegistrySize =
+    static_cast<size_t>(DeviceType::CUSTOM) + 1;
+
+inline std::array<Allocator*, kAllocatorRegistrySize> g_allocator_array{};
+inline std::array<uint8_t, kAllocatorRegistrySize> g_allocator_priority{};
+
+inline size_t allocator_device_index(DeviceType t) {
+  const size_t index = static_cast<size_t>(t);
+  TORCH_CHECK(index < kAllocatorRegistrySize,
+              "Allocator device type index out of range: ",
+              index);
+  return index;
+}
+
+inline void SetAllocator(DeviceType t, Allocator* alloc, uint8_t priority = 0) {
+  const size_t index = allocator_device_index(t);
+  if (priority >= g_allocator_priority[index]) {
+    g_allocator_array[index] = alloc;
+    g_allocator_priority[index] = priority;
+  }
+}
+
+inline Allocator* GetAllocator(const DeviceType& t) {
+  const size_t index = allocator_device_index(t);
+  auto* alloc = g_allocator_array[index];
+  TORCH_CHECK(alloc != nullptr, "Allocator for ", t, " is not set.");
+  return alloc;
+}
+
+template <DeviceType t>
+struct AllocatorRegisterer {
+  explicit AllocatorRegisterer(Allocator* alloc) { SetAllocator(t, alloc); }
+};
+
+#define REGISTER_ALLOCATOR(t, f)                       \
+  namespace {                                          \
+  static c10::AllocatorRegisterer<t> g_allocator_d(f); \
+  }
+
 }  // namespace c10
 
 namespace at {

From 1ea7eea57ef70b37ea6f48b9cdc0b316c786b4b5 Mon Sep 17 00:00:00 2001
From: youge325 <A372707325@126.com>
Date: Sun, 29 Mar 2026 21:33:23 +0800
Subject: [PATCH 6/8] add version.h, fix some cuda functions and storage test

---
 paddle/phi/api/include/compat/CMakeLists.txt  |  1 +
 .../include/compat/c10/cuda/CUDAFunctions.cpp | 44 +++++++++++++++++++
 .../include/compat/c10/cuda/CUDAFunctions.h   | 25 +----------
 .../torch/csrc/api/include/torch/version.h    | 39 ++++++++++++++++
 test/cpp/compat/c10_storage_test.cc           |  8 +++-
 5 files changed, 92 insertions(+), 25 deletions(-)
 create mode 100644 paddle/phi/api/include/compat/c10/cuda/CUDAFunctions.cpp
 create mode 100644 paddle/phi/api/include/compat/torch/csrc/api/include/torch/version.h

diff --git a/paddle/phi/api/include/compat/CMakeLists.txt b/paddle/phi/api/include/compat/CMakeLists.txt
index 3a3c939b44cba7..3f0c635261a0d8 100644
--- a/paddle/phi/api/include/compat/CMakeLists.txt
+++ b/paddle/phi/api/include/compat/CMakeLists.txt
@@ -1,5 +1,6 @@
 collect_srcs(api_srcs SRCS c10/core/Device.cpp)
 collect_srcs(api_srcs SRCS c10/core/Stream.cpp)
+collect_srcs(api_srcs SRCS c10/cuda/CUDAFunctions.cpp)
 collect_srcs(api_srcs SRCS c10/util/typeid.cpp)
 collect_srcs(api_srcs SRCS ATen/cuda/EmptyTensor.cpp)
 collect_srcs(api_srcs SRCS ATen/cuda/CUDAContextLight.cpp)
diff --git a/paddle/phi/api/include/compat/c10/cuda/CUDAFunctions.cpp b/paddle/phi/api/include/compat/c10/cuda/CUDAFunctions.cpp
new file mode 100644
index 00000000000000..482bf7036d7cea
--- /dev/null
+++ b/paddle/phi/api/include/compat/c10/cuda/CUDAFunctions.cpp
@@ -0,0 +1,44 @@
+// Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <c10/cuda/CUDAFunctions.h>
+
+namespace c10::cuda {
+
+c10::DeviceIndex device_count() {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  return phi::backends::gpu::GetGPUDeviceCount();
+#else
+  // Return 0 instead of throwing to match PyTorch API semantics
+  // at::cuda::is_available() relies on this returning 0/false
+  return 0;
+#endif
+}
+
+void device_synchronize() {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  int curr_device_id = paddle::platform::GetCurrentDeviceId();
+  paddle::platform::SetDeviceId(curr_device_id);
+#ifdef PADDLE_WITH_HIP
+  PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize());
+#else
+  PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize());
+#endif
+#else
+  PADDLE_THROW(common::errors::Unavailable(
+      "Paddle is not compiled with CUDA. Cannot visit device synchronize."));
+#endif
+}
+
+}  // namespace c10::cuda
diff --git a/paddle/phi/api/include/compat/c10/cuda/CUDAFunctions.h b/paddle/phi/api/include/compat/c10/cuda/CUDAFunctions.h
index 086c3f486cc6ae..bcc097cadd033d 100644
--- a/paddle/phi/api/include/compat/c10/cuda/CUDAFunctions.h
+++ b/paddle/phi/api/include/compat/c10/cuda/CUDAFunctions.h
@@ -18,30 +18,9 @@
 
 namespace c10::cuda {
 
-inline c10::DeviceIndex device_count() {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  return phi::backends::gpu::GetGPUDeviceCount();
-#else
-  // Return 0 instead of throwing to match PyTorch API semantics
-  // at::cuda::is_available() relies on this returning 0/false
-  return 0;
-#endif
-}
+c10::DeviceIndex device_count();
 
-inline void device_synchronize() {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  int curr_device_id = paddle::platform::GetCurrentDeviceId();
-  paddle::platform::SetDeviceId(curr_device_id);
-#ifdef PADDLE_WITH_HIP
-  PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize());
-#else
-  PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize());
-#endif
-#else
-  PADDLE_THROW(common::errors::Unavailable(
-      "Paddle is not compiled with CUDA. Cannot visit device synchronize."));
-#endif
-}
+void device_synchronize();
 
 void __inline__ stream_synchronize(gpuStream_t stream) {
   phi::backends::gpu::GpuStreamSync(stream);
diff --git a/paddle/phi/api/include/compat/torch/csrc/api/include/torch/version.h b/paddle/phi/api/include/compat/torch/csrc/api/include/torch/version.h
new file mode 100644
index 00000000000000..f745837944df17
--- /dev/null
+++ b/paddle/phi/api/include/compat/torch/csrc/api/include/torch/version.h
@@ -0,0 +1,39 @@
+// Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+/// Indicates the major version of LibTorch.
+#define TORCH_VERSION_MAJOR 2
+
+/// Indicates the minor version of LibTorch.
+#define TORCH_VERSION_MINOR 10
+
+/// Indicates the patch version of LibTorch.
+#define TORCH_VERSION_PATCH 0
+
+/// Indicates the ABI version tag of LibTorch.
+#define TORCH_VERSION_ABI_TAG 0
+
+/// Indicates the version of LibTorch as a string literal.
+#define TORCH_VERSION "2.10.0"
+
+/// Indicates the ABI version of LibTorch as a single uint64.
+/// [ byte ][ byte ][ byte ][ byte ][ byte ][ byte ][ byte ][ byte ]
+/// [ MAJ  ][ MIN  ][ PATCH][                              ABI TAG ]
+#define TORCH_ABI_VERSION                                                 \
+  (((0ULL + TORCH_VERSION_MAJOR) << 56) |                                 \
+   ((0ULL + TORCH_VERSION_MINOR) << 48) | /* NOLINT(whitespace/indent) */ \
+   ((0ULL + TORCH_VERSION_PATCH) << 40) | /* NOLINT(whitespace/indent) */ \
+   ((0ULL + TORCH_VERSION_ABI_TAG) << 0)) /* NOLINT(whitespace/indent) */
diff --git a/test/cpp/compat/c10_storage_test.cc b/test/cpp/compat/c10_storage_test.cc
index d94dee0d47b605..947786fb929870 100644
--- a/test/cpp/compat/c10_storage_test.cc
+++ b/test/cpp/compat/c10_storage_test.cc
@@ -732,9 +732,13 @@ TEST(StorageTest, DataPtrHelpersAndAllocatorSimpleDataPtrChecks) {
   dp.unsafe_set_device(c10::Device(c10::DeviceType::CPU));
   ASSERT_EQ(dp.device().type(), c10::DeviceType::CPU);
 
-  // is_simple_data_ptr: context==nullptr branch.
+  // PyTorch only treats context==data as a simple DataPtr.
   RawCompatibleAllocator compatible_alloc;
-  ASSERT_TRUE(compatible_alloc.is_simple_data_ptr(dp));
+  ASSERT_FALSE(compatible_alloc.is_simple_data_ptr(dp));
+
+  // is_simple_data_ptr: context==data branch.
+  c10::DataPtr simple = compatible_alloc.allocate(4);
+  ASSERT_TRUE(compatible_alloc.is_simple_data_ptr(simple));
 
   // is_simple_data_ptr: context!=data branch.
   c10::DataPtr non_simple = RawIncompatibleAllocator().allocate(4);

From fcb126aaf77e6877cd6c2a6fd7fb29ffab473673 Mon Sep 17 00:00:00 2001
From: youge325 <A372707325@126.com>
Date: Mon, 30 Mar 2026 14:12:59 +0800
Subject: [PATCH 7/8] try to fix Windows build

---
 paddle/phi/api/include/compat/c10/cuda/CUDAFunctions.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/paddle/phi/api/include/compat/c10/cuda/CUDAFunctions.h b/paddle/phi/api/include/compat/c10/cuda/CUDAFunctions.h
index bcc097cadd033d..25a0301783b606 100644
--- a/paddle/phi/api/include/compat/c10/cuda/CUDAFunctions.h
+++ b/paddle/phi/api/include/compat/c10/cuda/CUDAFunctions.h
@@ -22,12 +22,17 @@ c10::DeviceIndex device_count();
 
 void device_synchronize();
 
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 void __inline__ stream_synchronize(gpuStream_t stream) {
   phi::backends::gpu::GpuStreamSync(stream);
 }
+#endif
+
 }  // namespace c10::cuda
 
 namespace at::cuda {
 using c10::cuda::device_synchronize;
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 using c10::cuda::stream_synchronize;
+#endif
 }  // namespace at::cuda

From 53cb19c603b182add90dcbbe7d258dcc54a69ab9 Mon Sep 17 00:00:00 2001
From: youge325 <A372707325@126.com>
Date: Thu, 2 Apr 2026 17:08:25 +0800
Subject: [PATCH 8/8] revert raw_stream() and fix getStreamFromPool

---
 .../api/include/compat/c10/cuda/CUDAStream.h  | 22 +++++++++++++------
 test/cpp/compat/c10_Stream_test.cc            | 20 +++++++++++++++++
 2 files changed, 35 insertions(+), 7 deletions(-)

diff --git a/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h b/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h
index f0f8f23ee1b711..2ae22655f8423b 100644
--- a/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h
+++ b/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h
@@ -150,6 +150,8 @@ class CUDAStream {
 
   Device device() const { return Device(DeviceType::CUDA, device_index()); }
 
+  cudaStream_t raw_stream() const { return stream(); }
+
   struct c10::StreamData3 pack3() const {
     return stream_.pack3();
   }
@@ -228,15 +230,13 @@ inline CUDAStream getStreamFromPool(const int priority,
 }
 
 /**
- * Set the current CUDA stream for the device of the given stream in the
- * calling thread.
+ * Get a new stream from the CUDA stream pool.
  *
- * Implements per-thread, per-device current stream semantics: the change is
- * local to the calling OS thread and does not affect any shared state such as
- * Paddle's GPUContext.  Other threads continue to see their own current stream.
+ * This overload matches PyTorch's bool-based entry point and preserves the
+ * single-argument form `getStreamFromPool(true)` for high-priority requests.
  */
-inline CUDAStream getStreamFromPool(const bool isHighPriority,
-                                    c10::DeviceIndex device_index) {
+inline CUDAStream getStreamFromPool(const bool isHighPriority = false,
+                                    c10::DeviceIndex device_index = -1) {
   return getStreamFromPool(isHighPriority ? -1 : 0, device_index);
 }
 
@@ -246,6 +246,14 @@ inline CUDAStream getStreamFromExternal(cudaStream_t ext_stream,
   return make_cuda_stream(ext_stream, device_index);
 }
 
+/**
+ * Set the current CUDA stream for the device of the given stream in the
+ * calling thread.
+ *
+ * Implements per-thread, per-device current stream semantics: the change is
+ * local to the calling OS thread and does not affect any shared state such as
+ * Paddle's GPUContext.  Other threads continue to see their own current stream.
+ */
 inline void setCurrentCUDAStream(CUDAStream stream) {
   c10::DeviceIndex idx = stream.unwrap().device_index();
   detail::check_device_index(idx);
diff --git a/test/cpp/compat/c10_Stream_test.cc b/test/cpp/compat/c10_Stream_test.cc
index 7ee6a4c5f682d4..d41421c10abe53 100644
--- a/test/cpp/compat/c10_Stream_test.cc
+++ b/test/cpp/compat/c10_Stream_test.cc
@@ -112,6 +112,26 @@ TEST(CUDAStreamTest, DefaultStreamIsStable) {
   EXPECT_EQ(s1, s2);
 }
 
+TEST(CUDAStreamTest, GetStreamFromPoolBoolOverloadPreservesHighPriority) {
+  SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
+  auto low_priority_stream =
+      c10::cuda::getStreamFromPool(/*isHighPriority=*/false);
+  auto high_priority_stream =
+      c10::cuda::getStreamFromPool(/*isHighPriority=*/true);
+  auto explicit_high_priority_stream = c10::cuda::getStreamFromPool(-1);
+
+  const int low_priority = low_priority_stream.priority();
+  const int high_priority = high_priority_stream.priority();
+  const int explicit_high_priority = explicit_high_priority_stream.priority();
+
+  if (low_priority == explicit_high_priority) {
+    return;
+  }
+
+  EXPECT_EQ(high_priority, explicit_high_priority);
+  EXPECT_NE(high_priority, low_priority);
+}
+
 // After setCurrentCUDAStream redirects the per-thread current stream,
 // getDefaultCUDAStream must still return the null stream.
 TEST(CUDAStreamTest, DefaultStreamUnaffectedBySetCurrentCUDAStream) {