diff --git a/.gitignore b/.gitignore
index b2a73328821657..abd75c85cc55ab 100644
--- a/.gitignore
+++ b/.gitignore
@@ -70,6 +70,7 @@ third_party/
 *~
 bazel-*
 .humanize
+.codex
 
 build_*
 # clion workspace.
diff --git a/paddle/phi/api/include/compat/ATen/core/TensorBody.h b/paddle/phi/api/include/compat/ATen/core/TensorBody.h
index a12dfba80b5bec..6445befda15ee5 100644
--- a/paddle/phi/api/include/compat/ATen/core/TensorBody.h
+++ b/paddle/phi/api/include/compat/ATen/core/TensorBody.h
@@ -687,12 +687,6 @@ class Tensor : public TensorBase {
   }
 
   void record_stream(at::Stream s) const;
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  void record_stream(at::cuda::CUDAStream s) const;
-  // TODO(youge325): Remove after DeepEP paddle branch is updated to use
-  // at::Stream
-  void record_stream(cudaStream_t s) const;
-#endif
 
   Tensor var(int dim) const { return var(at::IntArrayRef{dim}, true, false); }
 
diff --git a/paddle/phi/api/include/compat/ATen/core/ivalue.h b/paddle/phi/api/include/compat/ATen/core/ivalue.h
index d22e19c4cdc1c1..f1151e2dcf537d 100644
--- a/paddle/phi/api/include/compat/ATen/core/ivalue.h
+++ b/paddle/phi/api/include/compat/ATen/core/ivalue.h
@@ -224,6 +224,16 @@ class IValue {
   bool is_custom_class() const { return tag_ == TypeTag::CustomClass; }
   bool is_tuple() const { return tag_ == TypeTag::Tuple; }
 
+  bool isNone() const { return is_none(); }
+  bool isBool() const { return is_bool(); }
+  bool isInt() const { return is_int(); }
+  bool isDouble() const { return is_double(); }
+  bool isString() const { return is_string(); }
+  bool isList() const { return is_list(); }
+  bool isTensor() const { return is_tensor(); }
+  bool isCustomClass() const { return is_custom_class(); }
+  bool isTuple() const { return is_tuple(); }
+
   bool to_bool() const {
     if (!is_bool()) throw std::runtime_error("Not a bool");
     return std::get<bool>(value_);
@@ -280,6 +290,39 @@ class IValue {
     return static_cast<at::ScalarType>(std::get<int64_t>(value_));
   }
 
+  bool toBool() const { return to_bool(); }
+  int64_t toInt() const { return to_int(); }
+  double toDouble() const { return to_double(); }
+  const std::string& toStringRef() const { return to_string(); }
+  std::string_view toStringView() const { return to_string_view(); }
+  at::Tensor toTensor() const { return to_tensor(); }
+  at::ScalarType toScalarType() const { return to_scalar_type(); }
+
+  std::string tagKind() const {
+    switch (tag_) {
+      case TypeTag::None:
+        return "None";
+      case TypeTag::Bool:
+        return "Bool";
+      case TypeTag::Int:
+        return "Int";
+      case TypeTag::Double:
+        return "Double";
+      case TypeTag::String:
+        return "String";
+      case TypeTag::Tensor:
+        return "Tensor";
+      case TypeTag::GenericList:
+        return "GenericList";
+      case TypeTag::CustomClass:
+        return "CustomClass";
+      case TypeTag::Tuple:
+        return "Tuple";
+      default:
+        return "InvalidTag";
+    }
+  }
+
   template <typename T>
   intrusive_ptr<T> to_custom_class() const {
     if (!is_custom_class()) throw std::runtime_error("Not a custom class");
@@ -637,3 +680,7 @@ intrusive_ptr<T> generic_to(const IValue& ivalue,
 }
 
 }  // namespace torch
+
+namespace c10 {
+using IValue = ::torch::IValue;
+}
diff --git a/paddle/phi/api/include/compat/ATen/ops/arange.h b/paddle/phi/api/include/compat/ATen/ops/arange.h
index 54f08137119c45..b0e05d4e9ed56d 100644
--- a/paddle/phi/api/include/compat/ATen/ops/arange.h
+++ b/paddle/phi/api/include/compat/ATen/ops/arange.h
@@ -25,50 +25,48 @@
 
 namespace at {
 
-inline at::Tensor arange(const at::Scalar& end,
-                         at::TensorOptions options = {}) {
-  if (options.pinned_memory()) {
-    // Pinning memory is only supported for CPU tensors
-    if (options.has_device() && !options.device().is_cpu()) {
-      PD_THROW(
-          "pin_memory=true requires device to be CPU, but got non-CPU device");
-    }
-    phi::Place base_place = options._PD_GetPlace();
-    phi::Place pinned_place = compat::_PD_GetCreatePinnedPlace(base_place);
-    auto dense = paddle::experimental::arange(
-        paddle::experimental::full({}, 0, phi::DataType::FLOAT64),
-        paddle::experimental::full(
-            {}, end.to<double>(), phi::DataType::FLOAT64),
-        paddle::experimental::full({}, 1, phi::DataType::FLOAT64),
-        compat::_PD_AtenScalarTypeToPhiDataType(options.dtype()),
-        phi::CPUPlace());
-    return dense.copy_to(pinned_place, /*blocking=*/true);
+namespace detail {
+
+inline bool _PD_IsIntegralArangeScalar(const at::Scalar& scalar) {
+  switch (scalar.dtype()) {
+    case phi::DataType::BOOL:
+    case phi::DataType::UINT8:
+    case phi::DataType::INT8:
+    case phi::DataType::UINT16:
+    case phi::DataType::INT16:
+    case phi::DataType::UINT32:
+    case phi::DataType::INT32:
+    case phi::DataType::UINT64:
+    case phi::DataType::INT64:
+      return true;
+    default:
+      return false;
   }
-  return paddle::experimental::arange(
-      paddle::experimental::full({}, 0, phi::DataType::FLOAT64),
-      paddle::experimental::full({}, end.to<double>(), phi::DataType::FLOAT64),
-      paddle::experimental::full({}, 1, phi::DataType::FLOAT64),
-      compat::_PD_AtenScalarTypeToPhiDataType(options.dtype()),
-      options._PD_GetPlace());
 }
 
-inline at::Tensor arange(const at::Scalar& end,
-                         ::std::optional<at::ScalarType> dtype,
-                         ::std::optional<at::Layout> layout,
-                         ::std::optional<at::Device> device,
-                         ::std::optional<bool> pin_memory) {
-  auto options =
-      at::TensorOptions()
-          .dtype(dtype.value_or(c10::get_default_dtype_as_scalartype()))
-          .layout(layout)
-          .device(device.value_or(at::kCPU))
-          .pinned_memory(pin_memory);
-  return arange(end, options);
+inline at::ScalarType _PD_ResolveArangeDtype(const at::Scalar& start,
+                                             const at::Scalar& end,
+                                             const at::Scalar& step,
+                                             const at::TensorOptions& options) {
+  if (options.has_dtype()) {
+    return options.dtype().toScalarType();
+  }
+  if (_PD_IsIntegralArangeScalar(start) && _PD_IsIntegralArangeScalar(end) &&
+      _PD_IsIntegralArangeScalar(step)) {
+    return at::kLong;
+  }
+  return c10::get_default_dtype_as_scalartype();
 }
 
+}  // namespace detail
+
 inline at::Tensor arange(const at::Scalar& start,
                          const at::Scalar& end,
+                         const at::Scalar& step,
                          at::TensorOptions options = {}) {
+  // Match PyTorch: step must be non-zero and consistent with (end - start).
+  at::native::arange_check_bounds(start, end, step);
+  auto dtype = detail::_PD_ResolveArangeDtype(start, end, step, options);
   if (options.pinned_memory()) {
     // Pinning memory is only supported for CPU tensors
     if (options.has_device() && !options.device().is_cpu()) {
@@ -82,8 +80,9 @@ inline at::Tensor arange(const at::Scalar& start,
             {}, start.to<double>(), phi::DataType::FLOAT64),
         paddle::experimental::full(
             {}, end.to<double>(), phi::DataType::FLOAT64),
-        paddle::experimental::full({}, 1, phi::DataType::FLOAT64),
-        compat::_PD_AtenScalarTypeToPhiDataType(options.dtype()),
+        paddle::experimental::full(
+            {}, step.to<double>(), phi::DataType::FLOAT64),
+        compat::_PD_AtenScalarTypeToPhiDataType(dtype),
         phi::CPUPlace());
     return dense.copy_to(pinned_place, /*blocking=*/true);
   }
@@ -91,58 +90,47 @@ inline at::Tensor arange(const at::Scalar& start,
       paddle::experimental::full(
           {}, start.to<double>(), phi::DataType::FLOAT64),
       paddle::experimental::full({}, end.to<double>(), phi::DataType::FLOAT64),
-      paddle::experimental::full({}, 1, phi::DataType::FLOAT64),
-      compat::_PD_AtenScalarTypeToPhiDataType(options.dtype()),
+      paddle::experimental::full({}, step.to<double>(), phi::DataType::FLOAT64),
+      compat::_PD_AtenScalarTypeToPhiDataType(dtype),
       options._PD_GetPlace());
 }
 
-inline at::Tensor arange(const at::Scalar& start,
-                         const at::Scalar& end,
+inline at::Tensor arange(const at::Scalar& end,
+                         at::TensorOptions options = {}) {
+  return arange(/*start=*/0, end, /*step=*/1, options);
+}
+
+inline at::Tensor arange(const at::Scalar& end,
                          ::std::optional<at::ScalarType> dtype,
                          ::std::optional<at::Layout> layout,
                          ::std::optional<at::Device> device,
                          ::std::optional<bool> pin_memory) {
-  auto options =
-      at::TensorOptions()
-          .dtype(dtype.value_or(c10::get_default_dtype_as_scalartype()))
-          .layout(layout)
-          .device(device.value_or(at::kCPU))
-          .pinned_memory(pin_memory);
-  return arange(start, end, options);
+  auto options = at::TensorOptions()
+                     .dtype(dtype)
+                     .layout(layout)
+                     .device(device)
+                     .pinned_memory(pin_memory);
+  return arange(/*start=*/0, end, /*step=*/1, options);
 }
 
 inline at::Tensor arange(const at::Scalar& start,
                          const at::Scalar& end,
-                         const at::Scalar& step,
                          at::TensorOptions options = {}) {
-  // Match PyTorch: step must be non-zero and consistent with (end - start).
-  at::native::arange_check_bounds(start, end, step);
-  if (options.pinned_memory()) {
-    // Pinning memory is only supported for CPU tensors
-    if (options.has_device() && !options.device().is_cpu()) {
-      PD_THROW(
-          "pin_memory=true requires device to be CPU, but got non-CPU device");
-    }
-    phi::Place base_place = options._PD_GetPlace();
-    phi::Place pinned_place = compat::_PD_GetCreatePinnedPlace(base_place);
-    auto dense = paddle::experimental::arange(
-        paddle::experimental::full(
-            {}, start.to<double>(), phi::DataType::FLOAT64),
-        paddle::experimental::full(
-            {}, end.to<double>(), phi::DataType::FLOAT64),
-        paddle::experimental::full(
-            {}, step.to<double>(), phi::DataType::FLOAT64),
-        compat::_PD_AtenScalarTypeToPhiDataType(options.dtype()),
-        phi::CPUPlace());
-    return dense.copy_to(pinned_place, /*blocking=*/true);
-  }
-  return paddle::experimental::arange(
-      paddle::experimental::full(
-          {}, start.to<double>(), phi::DataType::FLOAT64),
-      paddle::experimental::full({}, end.to<double>(), phi::DataType::FLOAT64),
-      paddle::experimental::full({}, step.to<double>(), phi::DataType::FLOAT64),
-      compat::_PD_AtenScalarTypeToPhiDataType(options.dtype()),
-      options._PD_GetPlace());
+  return arange(start, end, /*step=*/1, options);
+}
+
+inline at::Tensor arange(const at::Scalar& start,
+                         const at::Scalar& end,
+                         ::std::optional<at::ScalarType> dtype,
+                         ::std::optional<at::Layout> layout,
+                         ::std::optional<at::Device> device,
+                         ::std::optional<bool> pin_memory) {
+  auto options = at::TensorOptions()
+                     .dtype(dtype)
+                     .layout(layout)
+                     .device(device)
+                     .pinned_memory(pin_memory);
+  return arange(start, end, /*step=*/1, options);
 }
 
 inline at::Tensor arange(const at::Scalar& start,
@@ -152,12 +140,11 @@ inline at::Tensor arange(const at::Scalar& start,
                          ::std::optional<at::Layout> layout,
                          ::std::optional<at::Device> device,
                          ::std::optional<bool> pin_memory) {
-  auto options =
-      at::TensorOptions()
-          .dtype(dtype.value_or(c10::get_default_dtype_as_scalartype()))
-          .layout(layout)
-          .device(device.value_or(at::kCPU))
-          .pinned_memory(pin_memory);
+  auto options = at::TensorOptions()
+                     .dtype(dtype)
+                     .layout(layout)
+                     .device(device)
+                     .pinned_memory(pin_memory);
   return arange(start, end, step, options);
 }
 
diff --git a/paddle/phi/api/include/compat/ATen/ops/equal.h b/paddle/phi/api/include/compat/ATen/ops/equal.h
index 4619144f8f5aba..1ac49d9e245d74 100644
--- a/paddle/phi/api/include/compat/ATen/ops/equal.h
+++ b/paddle/phi/api/include/compat/ATen/ops/equal.h
@@ -22,6 +22,12 @@
 namespace at {
 
 inline bool equal(const at::Tensor& self, const at::Tensor& other) {
+  PD_CHECK(self.defined(),
+           "Expected a proper Tensor but got None (or an undefined Tensor in "
+           "C++)");
+  PD_CHECK(other.defined(),
+           "Expected a proper Tensor but got None (or an undefined Tensor in "
+           "C++)");
   PD_CHECK(self.device() == other.device(),
            "Cannot compare two tensors on "
            "different devices. Got: ",
diff --git a/paddle/phi/api/include/compat/ATen/ops/record_stream.h b/paddle/phi/api/include/compat/ATen/ops/record_stream.h
index 73cb5dd4b2247c..ff43391cdc796c 100644
--- a/paddle/phi/api/include/compat/ATen/ops/record_stream.h
+++ b/paddle/phi/api/include/compat/ATen/ops/record_stream.h
@@ -49,22 +49,4 @@ inline void Tensor::record_stream(at::Stream s) const {
 #endif
 }
 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-inline void Tensor::record_stream(at::cuda::CUDAStream s) const {
-  record_stream(static_cast<at::Stream>(s));
-}
-
-// TODO(youge325): Remove after DeepEP paddle branch is updated to use
-// at::Stream
-inline void Tensor::record_stream(cudaStream_t s) const {
-  auto dense_tensor =
-      std::dynamic_pointer_cast<phi::DenseTensor>(tensor_.impl());
-  PD_CHECK(dense_tensor != nullptr,
-           "record_stream only supports DenseTensor, but got a non-dense "
-           "tensor implementation.");
-  PD_CHECK(dense_tensor->place().GetType() != phi::AllocationType::CPU,
-           "record_stream is not supported for CPU tensors.");
-  paddle::memory::RecordStream(dense_tensor->Holder(), s);
-}
-#endif
 }  // namespace at
diff --git a/paddle/phi/api/include/compat/ATen/ops/resize.h b/paddle/phi/api/include/compat/ATen/ops/resize.h
index 44232e54ff9353..ee4807027d0b31 100644
--- a/paddle/phi/api/include/compat/ATen/ops/resize.h
+++ b/paddle/phi/api/include/compat/ATen/ops/resize.h
@@ -23,13 +23,31 @@
 
 namespace at {
 
-// resize_ - in-place resize using reshape
+// resize_ - use reshape for same-numel cases and set_ for storage-changing
+// cases so repeated resize_ calls stay stable.
 inline const at::Tensor& Tensor::resize_(
     at::IntArrayRef size,
     ::std::optional<at::MemoryFormat> memory_format) const {
-  auto result =
-      paddle::experimental::reshape(tensor_, size._PD_ToPaddleIntArray());
-  const_cast<Tensor*>(this)->tensor_ = result;
+  if (memory_format.has_value()) {
+    TORCH_CHECK(*memory_format == at::MemoryFormat::Contiguous,
+                "resize_ only supports contiguous memory format, but got ",
+                static_cast<int>(*memory_format));
+  }
+
+  std::vector<int64_t> dims(size.begin(), size.end());
+  int64_t new_numel = 1;
+  for (auto dim : dims) {
+    new_numel *= dim;
+  }
+
+  if (tensor_.numel() == new_numel) {
+    const_cast<Tensor*>(this)->tensor_ =
+        paddle::experimental::reshape(tensor_, phi::IntArray(dims));
+    return *this;
+  }
+
+  auto source = tensor_.copy_to(tensor_.place(), /*blocking=*/true);
+  paddle::experimental::set_(const_cast<Tensor*>(this)->tensor_, source, dims);
   return *this;
 }
 
diff --git a/paddle/phi/api/include/compat/ATen/ops/select.h b/paddle/phi/api/include/compat/ATen/ops/select.h
index 8c859da44349b6..6c522db600add2 100644
--- a/paddle/phi/api/include/compat/ATen/ops/select.h
+++ b/paddle/phi/api/include/compat/ATen/ops/select.h
@@ -19,13 +19,35 @@
 namespace at {
 
 inline at::Tensor select(const at::Tensor& self, int64_t dim, int64_t index) {
+  // Normalize dim to positive value for error messages
+  int64_t orig_dim = dim;
   if (dim < 0) {
     dim += self.dim();
   }
-  // Handle negative indexing
+  // Check dim is valid
+  if (dim < 0 || dim >= self.dim()) {
+    PD_CHECK(false,
+             "select(): index ",
+             orig_dim,
+             " out of range for tensor of size ",
+             self.sizes(),
+             " at dimension ",
+             orig_dim);
+  }
+  // Handle negative index
+  int64_t orig_index = index;
   if (index < 0) {
-    int64_t dim_size = self.size(dim);
-    index = dim_size + index;
+    index = self.size(dim) + index;
+  }
+  // Check index is valid
+  if (index < 0 || index >= self.size(dim)) {
+    PD_CHECK(false,
+             "select(): index ",
+             orig_index,
+             " out of range for tensor of size ",
+             self.sizes(),
+             " at dimension ",
+             orig_dim < 0 ? orig_dim + self.dim() : orig_dim);
   }
 
   return Tensor(
diff --git a/paddle/phi/api/include/compat/ATen/ops/std.h b/paddle/phi/api/include/compat/ATen/ops/std.h
index b8600de2c857d9..ed7875f64431c1 100644
--- a/paddle/phi/api/include/compat/ATen/ops/std.h
+++ b/paddle/phi/api/include/compat/ATen/ops/std.h
@@ -32,6 +32,21 @@ inline Tensor std_impl(const Tensor& self,
                        const std::vector<int64_t>& dims_vec,
                        double correction_value,
                        bool keepdim) {
+  // Validate dimensions before processing
+  int64_t ndim = self.dim();
+  for (int64_t d : dims_vec) {
+    int64_t dim_idx = d < 0 ? d + ndim : d;
+    if (dim_idx < 0 || dim_idx >= ndim) {
+      PD_CHECK(false,
+               "Dimension out of range (expected to be in range of [",
+               -ndim,
+               ", ",
+               ndim - 1,
+               "], but got ",
+               d,
+               ")");
+    }
+  }
   phi::IntArray dims_int_array(dims_vec);
   paddle::Tensor tensor = self._PD_GetInner();
 
diff --git a/paddle/phi/api/include/compat/CMakeLists.txt b/paddle/phi/api/include/compat/CMakeLists.txt
index 3a3c939b44cba7..3f0c635261a0d8 100644
--- a/paddle/phi/api/include/compat/CMakeLists.txt
+++ b/paddle/phi/api/include/compat/CMakeLists.txt
@@ -1,5 +1,6 @@
 collect_srcs(api_srcs SRCS c10/core/Device.cpp)
 collect_srcs(api_srcs SRCS c10/core/Stream.cpp)
+collect_srcs(api_srcs SRCS c10/cuda/CUDAFunctions.cpp)
 collect_srcs(api_srcs SRCS c10/util/typeid.cpp)
 collect_srcs(api_srcs SRCS ATen/cuda/EmptyTensor.cpp)
 collect_srcs(api_srcs SRCS ATen/cuda/CUDAContextLight.cpp)
diff --git a/paddle/phi/api/include/compat/c10/core/Allocator.h b/paddle/phi/api/include/compat/c10/core/Allocator.h
index a35070936031ad..14a9f2f7dd57d1 100644
--- a/paddle/phi/api/include/compat/c10/core/Allocator.h
+++ b/paddle/phi/api/include/compat/c10/core/Allocator.h
@@ -23,7 +23,9 @@
 #include <c10/util/Exception.h>
 #include <c10/util/UniqueVoidPtr.h>
 
+#include <array>
 #include <cstddef>
+#include <cstdint>
 #include <cstring>
 #include <functional>
 #include <memory>
@@ -37,6 +39,15 @@ namespace c10 {
 // Deleter function pointer type (compatible with LibTorch)
 using DeleterFnPtr = void (*)(void*);
 
+using CaptureId_t = uint64_t;
+using MempoolId_t = std::pair<CaptureId_t, CaptureId_t>;
+
+struct MempoolIdHash {
+  std::size_t operator()(const MempoolId_t& mempool_id) const noexcept {
+    return mempool_id.first != 0 ? mempool_id.first : mempool_id.second;
+  }
+};
+
 // DataPtr class compatible with LibTorch's c10::DataPtr
 // Wraps a pointer with associated device and deleter
 class DataPtr {
@@ -63,6 +74,7 @@ class DataPtr {
 
   void clear() { ptr_.clear(); }
   void* get() const { return ptr_.get(); }
+  void* mutable_get() { return ptr_.get(); }
   void* get_context() const { return ptr_.get_context(); }
   void* release_context() { return ptr_.release_context(); }
 
@@ -128,15 +140,14 @@ struct Allocator {
   // Requires: input data was allocated by the same allocator.
   DataPtr clone(const void* data, std::size_t n) {
     auto new_data = allocate(n);
-    copy_data(new_data.get(), data, n);
+    copy_data(new_data.mutable_get(), data, n);
     return new_data;
   }
 
   // Checks if DataPtr has a simple context, not wrapped with any out of the
   // ordinary contexts.
   virtual bool is_simple_data_ptr(const DataPtr& data_ptr) const {
-    return data_ptr.get_context() == nullptr ||
-           data_ptr.get_context() == data_ptr.get();
+    return data_ptr.get() == data_ptr.get_context();
   }
 
   // If this returns a non nullptr, it means that allocate()
@@ -176,6 +187,90 @@ struct Allocator {
   }
 };
 
+struct InefficientStdFunctionContext {
+  void* ptr_{nullptr};
+  std::function<void(void*)> deleter_;
+
+  InefficientStdFunctionContext(void* ptr, std::function<void(void*)> deleter)
+      : ptr_(ptr), deleter_(std::move(deleter)) {}
+
+  InefficientStdFunctionContext(const InefficientStdFunctionContext&) = delete;
+
+  InefficientStdFunctionContext(InefficientStdFunctionContext&& rhs) noexcept
+      : ptr_(std::exchange(rhs.ptr_, nullptr)),
+        deleter_(std::move(rhs.deleter_)) {}
+
+  InefficientStdFunctionContext& operator=(
+      const InefficientStdFunctionContext&) = delete;
+
+  InefficientStdFunctionContext& operator=(
+      InefficientStdFunctionContext&& rhs) {
+    this->~InefficientStdFunctionContext();
+    ptr_ = std::exchange(rhs.ptr_, nullptr);
+    deleter_ = std::move(rhs.deleter_);
+    return *this;
+  }
+
+  ~InefficientStdFunctionContext() {
+    if (deleter_) {
+      deleter_(ptr_);
+    }
+  }
+
+  static DataPtr makeDataPtr(void* ptr,
+                             std::function<void(void*)> deleter,
+                             Device device) {
+    return DataPtr(ptr,
+                   new InefficientStdFunctionContext(ptr, std::move(deleter)),
+                   &deleteContext,
+                   device);
+  }
+
+ private:
+  static void deleteContext(void* ptr) {
+    delete static_cast<InefficientStdFunctionContext*>(ptr);
+  }
+};
+
+inline constexpr size_t kAllocatorRegistrySize =
+    static_cast<size_t>(DeviceType::CUSTOM) + 1;
+
+inline std::array<Allocator*, kAllocatorRegistrySize> g_allocator_array{};
+inline std::array<uint8_t, kAllocatorRegistrySize> g_allocator_priority{};
+
+inline size_t allocator_device_index(DeviceType t) {
+  const size_t index = static_cast<size_t>(t);
+  TORCH_CHECK(index < kAllocatorRegistrySize,
+              "Allocator device type index out of range: ",
+              index);
+  return index;
+}
+
+inline void SetAllocator(DeviceType t, Allocator* alloc, uint8_t priority = 0) {
+  const size_t index = allocator_device_index(t);
+  if (priority >= g_allocator_priority[index]) {
+    g_allocator_array[index] = alloc;
+    g_allocator_priority[index] = priority;
+  }
+}
+
+inline Allocator* GetAllocator(const DeviceType& t) {
+  const size_t index = allocator_device_index(t);
+  auto* alloc = g_allocator_array[index];
+  TORCH_CHECK(alloc != nullptr, "Allocator for ", t, " is not set.");
+  return alloc;
+}
+
+template <DeviceType t>
+struct AllocatorRegisterer {
+  explicit AllocatorRegisterer(Allocator* alloc) { SetAllocator(t, alloc); }
+};
+
+#define REGISTER_ALLOCATOR(t, f)                       \
+  namespace {                                          \
+  static c10::AllocatorRegisterer<t> g_allocator_d(f); \
+  }
+
 }  // namespace c10
 
 namespace at {
diff --git a/paddle/phi/api/include/compat/c10/core/Device.cpp b/paddle/phi/api/include/compat/c10/core/Device.cpp
index 9a170f10a335be..db8e74c3feb24f 100644
--- a/paddle/phi/api/include/compat/c10/core/Device.cpp
+++ b/paddle/phi/api/include/compat/c10/core/Device.cpp
@@ -18,7 +18,13 @@
 
 #include <c10/core/Device.h>
 #include <c10/util/Exception.h>
+
+#include <algorithm>
 #include <array>
+#include <cctype>
+#include <exception>
+#include <string>
+
 #include "paddle/common/enforce.h"
 
 namespace c10 {
@@ -45,44 +51,98 @@ const char* DeviceTypeToString(DeviceType type) {
 
 DeviceType parse_type(const std::string& device_string) {
   static const std::array<std::pair<const char*, DeviceType>,
-                          static_cast<size_t>(4)>
+                          static_cast<size_t>(5)>
       types = {{
           {"cpu", DeviceType::CPU},
           {"cuda", DeviceType::CUDA},
           {"ipu", DeviceType::IPU},
           {"xpu", DeviceType::XPU},
+          {"privateuseone", DeviceType::PrivateUse1},
       }};
-  for (const auto& type_pair : types) {
-    if (device_string == type_pair.first) {
-      return type_pair.second;
-    }
+  auto device = std::find_if(
+      types.begin(),
+      types.end(),
+      [&device_string](const std::pair<const char*, DeviceType>& p) {
+        return p.first && p.first == device_string;
+      });
+  if (device != types.end()) {
+    return device->second;
   }
-  PADDLE_THROW(::common::errors::InvalidArgument(
-      "Unknown device type: '%s'. Supported device types are ",
-      "'cpu', 'cuda', 'ipu', and 'xpu'.",
-      device_string));
+  TORCH_CHECK(false,
+              "Expected one of cpu, cuda, ipu, xpu, privateuseone device type "
+              "at start of device string: ",
+              device_string);
 }
 
+enum DeviceStringParsingState { kStart, kIndexStart, kIndexRest, kError };
+
 Device::Device(const std::string& device_string) : Device(Type::CPU) {
   TORCH_CHECK(!device_string.empty(), "Device string must not be empty");
-  auto colon_pos = device_string.find(':');
-  std::string type_str = colon_pos == std::string::npos
-                             ? device_string
-                             : device_string.substr(0, colon_pos);
-  type_ = parse_type(type_str);
-  index_ = -1;
-  if (colon_pos != std::string::npos) {
-    std::string index_str = device_string.substr(colon_pos + 1);
-    try {
-      index_ = static_cast<DeviceIndex>(std::stoi(index_str));
-    } catch (const std::invalid_argument&) {
-      PADDLE_THROW(::common::errors::InvalidArgument(
-          "Invalid device index: '%s' is not a number.", index_str));
-    } catch (const std::out_of_range&) {
-      PADDLE_THROW(::common::errors::InvalidArgument(
-          "Invalid device index: '%s' is out of range.", index_str));
+
+  std::string device_name, device_index_str;
+  DeviceStringParsingState pstate = DeviceStringParsingState::kStart;
+
+  for (size_t i = 0;
+       pstate != DeviceStringParsingState::kError && i < device_string.size();
+       ++i) {
+    const char ch = device_string.at(i);
+    const unsigned char uch = static_cast<unsigned char>(ch);
+    switch (pstate) {
+      case DeviceStringParsingState::kStart:
+        if (ch != ':') {
+          if (std::isalpha(uch) || ch == '_') {
+            device_name.push_back(ch);
+          } else {
+            pstate = DeviceStringParsingState::kError;
+          }
+        } else {
+          pstate = DeviceStringParsingState::kIndexStart;
+        }
+        break;
+      case DeviceStringParsingState::kIndexStart:
+        if (std::isdigit(uch)) {
+          device_index_str.push_back(ch);
+          pstate = DeviceStringParsingState::kIndexRest;
+        } else {
+          pstate = DeviceStringParsingState::kError;
+        }
+        break;
+      case DeviceStringParsingState::kIndexRest:
+        if (device_index_str.at(0) == '0') {
+          pstate = DeviceStringParsingState::kError;
+          break;
+        }
+        if (std::isdigit(uch)) {
+          device_index_str.push_back(ch);
+        } else {
+          pstate = DeviceStringParsingState::kError;
+        }
+        break;
+      case DeviceStringParsingState::kError:
+        break;
+    }
+  }
+
+  const bool has_error = device_name.empty() ||
+                         pstate == DeviceStringParsingState::kError ||
+                         (pstate == DeviceStringParsingState::kIndexStart &&
+                          device_index_str.empty());
+  TORCH_CHECK(!has_error, "Invalid device string: '", device_string, "'");
+
+  try {
+    if (!device_index_str.empty()) {
+      index_ = static_cast<DeviceIndex>(std::stoi(device_index_str));
     }
+  } catch (const std::exception&) {
+    TORCH_CHECK(false,
+                "Could not parse device index '",
+                device_index_str,
+                "' in device string '",
+                device_string,
+                "'");
   }
+  type_ = parse_type(device_name);
+  validate();
 }
 
 std::string Device::str() const {
diff --git a/paddle/phi/api/include/compat/c10/core/Device.h b/paddle/phi/api/include/compat/c10/core/Device.h
index 5197d9a52790e9..e2d9166014577c 100644
--- a/paddle/phi/api/include/compat/c10/core/Device.h
+++ b/paddle/phi/api/include/compat/c10/core/Device.h
@@ -24,7 +24,12 @@ using gpuStream_t = hipStream_t;
 #endif
 
 #include <c10/core/DeviceType.h>
+#include <c10/util/Exception.h>
 
+#include <cstddef>
+#include <cstdint>
+#include <functional>
+#include <iosfwd>
 #include <string>
 #include <utility>
 
@@ -42,13 +47,19 @@ struct Device final {
         index_(place.GetType() == phi::AllocationType::CPU
                    ? static_cast<DeviceIndex>(-1)
                    : place.GetDeviceId()),
-        custom_device_type_(place.GetDeviceType()) {}
+        custom_device_type_(place.GetDeviceType()) {
+    validate();
+  }
   Device(DeviceType type, DeviceIndex index = -1)
-      : type_(type), index_(index) {}  // NOLINT
+      : type_(type), index_(index) {  // NOLINT
+    validate();
+  }
   Device(DeviceType type, DeviceIndex index, std::string custom_device_type)
       : type_(type),
         index_(index),
-        custom_device_type_(std::move(custom_device_type)) {}  // NOLINT
+        custom_device_type_(std::move(custom_device_type)) {  // NOLINT
+    validate();
+  }
 
   /// Constructs a `Device` from a string description, for convenience.
   /// The string supplied must follow the following schema:
@@ -63,10 +74,51 @@ struct Device final {
 
   DeviceType type() const noexcept { return type_; }
 
+  bool operator!=(const Device& other) const noexcept {
+    return !(*this == other);
+  }
+
+  void set_index(DeviceIndex index) {
+    index_ = index;
+    validate();
+  }
+
   bool is_cuda() const noexcept { return type_ == DeviceType::CUDA; }
 
+  bool is_privateuseone() const noexcept {
+    return type_ == DeviceType::PrivateUse1;
+  }
+
+  bool is_mps() const noexcept { return false; }
+
+  bool is_hip() const noexcept { return false; }
+
+  bool is_ve() const noexcept { return false; }
+
+  bool is_xpu() const noexcept { return type_ == DeviceType::XPU; }
+
+  bool is_ipu() const noexcept { return type_ == DeviceType::IPU; }
+
+  bool is_xla() const noexcept { return false; }
+
+  bool is_mtia() const noexcept { return false; }
+
+  bool is_hpu() const noexcept { return false; }
+
+  bool is_lazy() const noexcept { return false; }
+
+  bool is_vulkan() const noexcept { return false; }
+
+  bool is_metal() const noexcept { return false; }
+
+  bool is_maia() const noexcept { return false; }
+
+  bool is_meta() const noexcept { return false; }
+
   bool is_cpu() const noexcept { return type_ == DeviceType::CPU; }
 
+  bool supports_as_strided() const noexcept { return type_ != DeviceType::IPU; }
+
   std::string str() const;
 
   bool operator==(const Device& other) const noexcept {
@@ -96,12 +148,37 @@ struct Device final {
   DeviceType type_{DeviceType::CPU};
   DeviceIndex index_{-1};
   std::string custom_device_type_;
+
+  void validate() {
+#ifndef NDEBUG
+    TORCH_INTERNAL_ASSERT(index_ >= -1,
+                          "Device index must be -1 or non-negative, got ",
+                          static_cast<int>(index_));
+    TORCH_INTERNAL_ASSERT(!is_cpu() || index_ <= 0,
+                          "CPU device index must be -1 or zero, got ",
+                          static_cast<int>(index_));
+#endif
+  }
 };
 
 std::ostream& operator<<(std::ostream& stream, const Device& device);
 
 }  // namespace c10
 
+namespace std {
+template <>
+struct hash<c10::Device> {
+  size_t operator()(c10::Device d) const noexcept {
+    static_assert(sizeof(c10::DeviceType) == 1, "DeviceType is not 8-bit");
+    static_assert(sizeof(c10::DeviceIndex) == 1, "DeviceIndex is not 8-bit");
+    uint32_t bits = static_cast<uint32_t>(static_cast<uint8_t>(d.type()))
+                        << 16 |
+                    static_cast<uint32_t>(static_cast<uint8_t>(d.index()));
+    return std::hash<uint32_t>{}(bits);
+  }
+};
+}  // namespace std
+
 namespace at {
 using c10::Device;
 using c10::DeviceIndex;
diff --git a/paddle/phi/api/include/compat/c10/core/DeviceType.h b/paddle/phi/api/include/compat/c10/core/DeviceType.h
index 9e6487dbadb061..a67b5d880e0b3e 100644
--- a/paddle/phi/api/include/compat/c10/core/DeviceType.h
+++ b/paddle/phi/api/include/compat/c10/core/DeviceType.h
@@ -14,6 +14,8 @@
 
 #pragma once
 
+#include <cstdint>
+#include <functional>
 #include <ostream>
 
 #include "paddle/phi/common/place.h"
@@ -26,6 +28,7 @@ enum class DeviceType : int8_t {
   XPU = 12,
   IPU = 18,
   CUSTOM = 20,
+  PrivateUse1 = CUSTOM,
 };
 
 constexpr DeviceType kCUDA = DeviceType::CUDA;
@@ -33,6 +36,7 @@ constexpr DeviceType kCPU = DeviceType::CPU;
 constexpr DeviceType kCUSTOM = DeviceType::CUSTOM;
 constexpr DeviceType kXPU = DeviceType::XPU;
 constexpr DeviceType kIPU = DeviceType::IPU;
+constexpr DeviceType kPrivateUse1 = DeviceType::PrivateUse1;
 
 inline phi::AllocationType DeviceTypeToPhi(DeviceType d) {
   switch (d) {
@@ -103,12 +107,22 @@ inline std::ostream& operator<<(std::ostream& os, DeviceType d) {
 
 }  // namespace c10
 
+namespace std {
+template <>
+struct hash<c10::DeviceType> {
+  std::size_t operator()(c10::DeviceType k) const noexcept {
+    return std::hash<int>()(static_cast<int>(k));
+  }
+};
+}  // namespace std
+
 namespace at {
 using c10::DeviceType;
 using c10::kCPU;
 using c10::kCUDA;
 using c10::kCUSTOM;
 using c10::kIPU;
+using c10::kPrivateUse1;
 using c10::kXPU;
 }  // namespace at
 
@@ -118,5 +132,6 @@ using c10::kCPU;
 using c10::kCUDA;
 using c10::kCUSTOM;
 using c10::kIPU;
+using c10::kPrivateUse1;
 using c10::kXPU;
 }  // namespace torch
diff --git a/paddle/phi/api/include/compat/c10/core/Event.h b/paddle/phi/api/include/compat/c10/core/Event.h
index 5d2c2d10b710d5..04ea3c7eaae0f1 100644
--- a/paddle/phi/api/include/compat/c10/core/Event.h
+++ b/paddle/phi/api/include/compat/c10/core/Event.h
@@ -20,38 +20,6 @@
 #include <queue>
 namespace c10 {
 
-/**
- * A backend-generic movable, not copyable, not thread-safe event.
- *
- * The design of this event follows that of CUDA and HIP events. These events
- * are recorded and waited on by streams and can be rerecorded to,
- * each rerecording essentially creating a new version of the event.
- * For example, if (in CPU time), stream X is asked to record E,
- * stream Y waits on E, and stream X is asked to record E again, then Y will
- * wait for X to finish the first call to record and not the second, because
- * it's waiting on the first version of event E, not the second.
- * Querying an event only returns the status of its most recent version.
- *
- * Backend-generic events are implemented by this class and
- * impl::InlineEvent. In addition to these events there are also
- * some backend-specific events, like ATen's CUDAEvent. Each of these
- * classes has its own use.
- *
- * impl::InlineEvent<...> or a backend-specific event should be
- * preferred when the backend is known at compile time and known to
- * be compiled. Backend-specific events may have additional functionality.
- *
- * This Event should be used if a particular backend may not be available,
- * or the backend required is not known at compile time.
- *
- * These generic events are built on top of DeviceGuardImpls, analogous
- * to DeviceGuard and InlineDeviceGuard. The name "DeviceGuardImpls,"
- * is no longer entirely accurate, as these classes implement the
- * backend-specific logic for a generic backend interface.
- *
- * See DeviceGuardImplInterface.h for a list of all supported flags.
- */
-
 #ifdef PADDLE_WITH_CUDA
 
 class EventPool {
@@ -136,12 +104,6 @@ struct Event final {
 
   void record(const c10::cuda::CUDAStream &stream) { record(stream.unwrap()); }
 
-  // TODO(youge325): Remove after DeepEP paddle branch is updated to use
-  // c10::Stream
-  void record(const cudaStream_t &stream) {
-    C10_CUDA_CHECK(cudaEventRecord(cuda_event_, stream));
-  }
-
   void block(const Stream &stream) const {
     C10_CUDA_CHECK(cudaStreamWaitEvent(
         static_cast<cudaStream_t>(stream.native_handle()), cuda_event_, 0));
diff --git a/paddle/phi/api/include/compat/c10/core/ScalarType.h b/paddle/phi/api/include/compat/c10/core/ScalarType.h
index 5495e655040dcb..0fe99f5d1a0f04 100644
--- a/paddle/phi/api/include/compat/c10/core/ScalarType.h
+++ b/paddle/phi/api/include/compat/c10/core/ScalarType.h
@@ -130,58 +130,15 @@ struct dummy_int1_7_t {};
   _(uint32_t, UINT32, UInt32)
 
 enum class PADDLE_API ScalarType : int8_t {
-  Byte = 0,
-  Char = 1,
-  Short = 2,
-  Int = 3,
-  Long = 4,
-  Half = 5,
-  Float = 6,
-  Double = 7,
-  ComplexHalf = 8,
-  ComplexFloat = 9,
-  ComplexDouble = 10,
-  Bool = 11,
-  QInt8 = 12,
-  QUInt8 = 13,
-  QInt32 = 14,
-  BFloat16 = 15,
-  QUInt4x2 = 16,
-  QUInt2x4 = 17,
-  Bits1x8 = 18,
-  Bits2x4 = 19,
-  Bits4x2 = 20,
-  Bits8 = 21,
-  Bits16 = 22,
-  Float8_e5m2 = 23,
-  Float8_e4m3fn = 24,
-  Float8_e5m2fnuz = 25,
-  Float8_e4m3fnuz = 26,
-  UInt16 = 27,
-  UInt32 = 28,
-  UInt64 = 29,
-  UInt1 = 30,
-  UInt2 = 31,
-  UInt3 = 32,
-  UInt4 = 33,
-  UInt5 = 34,
-  UInt6 = 35,
-  UInt7 = 36,
-  Int1 = 37,
-  Int2 = 38,
-  Int3 = 39,
-  Int4 = 40,
-  Int5 = 41,
-  Int6 = 42,
-  Int7 = 43,
-  Float8_e8m0fnu = 44,
-  Float4_e2m1fn_x2 = 45,
-  Undefined = 46,
-  NumOptions = 47
+#define DEFINE_ST_ENUM_VAL_(_1, _2, n) n,
+  AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(DEFINE_ST_ENUM_VAL_)
+#undef DEFINE_ENUM_ST_ENUM_VAL_
+#define DEFINE_ST_ENUM_VAL_FOR_QINTS_(_1, n) n,
+      AT_FORALL_QINT_TYPES(DEFINE_ST_ENUM_VAL_FOR_QINTS_)
+#undef DEFINE_ST_ENUM_VAL_FOR_QINTS_
+          Undefined,
+  NumOptions
 };
-
-constexpr uint16_t NumScalarTypes =
-    static_cast<uint16_t>(ScalarType::NumOptions);
 namespace impl {
 
 // These are used to map ScalarTypes to C++ types.
@@ -281,38 +238,6 @@ inline const char* toString(ScalarType t) {
 
   switch (t) {
     AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(DEFINE_CASE)
-    case ScalarType::QInt8:
-      return "QInt8";
-    case ScalarType::QUInt8:
-      return "QUInt8";
-    case ScalarType::QInt32:
-      return "QInt32";
-    case ScalarType::QUInt4x2:
-      return "QUInt4x2";
-    case ScalarType::QUInt2x4:
-      return "QUInt2x4";
-    case ScalarType::ComplexHalf:
-      return "ComplexHalf";
-    case ScalarType::Bits1x8:
-      return "Bits1x8";
-    case ScalarType::Bits2x4:
-      return "Bits2x4";
-    case ScalarType::Bits4x2:
-      return "Bits4x2";
-    case ScalarType::Bits8:
-      return "Bits8";
-    case ScalarType::Bits16:
-      return "Bits16";
-    case ScalarType::Float8_e5m2fnuz:
-      return "Float8_e5m2fnuz";
-    case ScalarType::Float8_e4m3fnuz:
-      return "Float8_e4m3fnuz";
-    case ScalarType::Float8_e8m0fnu:
-      return "Float8_e8m0fnu";
-    case ScalarType::Float4_e2m1fn_x2:
-      return "Float4_e2m1fn_x2";
-    case ScalarType::Undefined:
-      return "Undefined";
     default:
       return "UNKNOWN_SCALAR";
   }
@@ -326,18 +251,6 @@ inline size_t elementSize(ScalarType t) {
 
   switch (t) {
     AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(CASE_ELEMENTSIZE_CASE)
-    case ScalarType::QInt8:
-    case ScalarType::QUInt8:
-    case ScalarType::QUInt4x2:
-    case ScalarType::QUInt2x4:
-    case ScalarType::Bits1x8:
-    case ScalarType::Bits2x4:
-    case ScalarType::Bits4x2:
-    case ScalarType::Bits8:
-      return 1;
-    case ScalarType::QInt32:
-    case ScalarType::Bits16:
-      return 4;
     default:
       TORCH_CHECK(false, "Unknown ScalarType");
   }
@@ -410,7 +323,6 @@ inline bool isSignedType(ScalarType t) {
     // Complex types (treated as signed)
     case ScalarType::ComplexFloat:
     case ScalarType::ComplexDouble:
-    case ScalarType::ComplexHalf:
       return true;
 
     // Signed quantized types (explicitly return true)
@@ -438,22 +350,11 @@ inline bool isSignedType(ScalarType t) {
     case ScalarType::QUInt8:
     case ScalarType::QUInt4x2:
     case ScalarType::QUInt2x4:
-    case ScalarType::Bits1x8:
-    case ScalarType::Bits2x4:
-    case ScalarType::Bits4x2:
-    case ScalarType::Bits8:
-    case ScalarType::Bits16:
       return false;
 
       // Bool is unsigned (using numeric_limits)
       CASE_ISSIGNED(Bool);
 
-    case ScalarType::Float8_e5m2fnuz:
-    case ScalarType::Float8_e4m3fnuz:
-    case ScalarType::Float8_e8m0fnu:
-    case ScalarType::Float4_e2m1fn_x2:
-      return true;
-
     // Invalid/undefined types - should not happen in normal usage
     // If this is hit, it indicates a programming error or unsupported type
     case ScalarType::Undefined:
diff --git a/paddle/phi/api/include/compat/c10/core/Stream.cpp b/paddle/phi/api/include/compat/c10/core/Stream.cpp
index 9a52b8c9f9f05d..60873f6e05a93c 100644
--- a/paddle/phi/api/include/compat/c10/core/Stream.cpp
+++ b/paddle/phi/api/include/compat/c10/core/Stream.cpp
@@ -44,9 +44,11 @@ void* Stream::native_handle() const {
     return reinterpret_cast<void*>(static_cast<intptr_t>(id_));
   }
 #endif
-  PADDLE_THROW(::common::errors::Unimplemented(
-      "c10::Stream::native_handle() is not supported for device type %d",
-      static_cast<int>(device_type())));
+  // Match PyTorch error message format for unsupported device types
+  PD_CHECK(false,
+           "native_handle() is not supported for this device type (",
+           static_cast<int>(device_type()),
+           ")");
 }
 
 bool Stream::query() const {
diff --git a/paddle/phi/api/include/compat/c10/core/Stream.h b/paddle/phi/api/include/compat/c10/core/Stream.h
index 58912130daf303..f68e863eb931dd 100644
--- a/paddle/phi/api/include/compat/c10/core/Stream.h
+++ b/paddle/phi/api/include/compat/c10/core/Stream.h
@@ -90,9 +90,8 @@ class Stream final {
 };
 
 inline std::ostream& operator<<(std::ostream& os, const Stream& s) {
-  os << "Stream(device_type=" << static_cast<int>(s.device_type())
-     << ", device_index=" << static_cast<int>(s.device_index())
-     << ", id=" << s.id() << ")";
+  // Format: "stream {id} on device {device_type}:{device_index}"
+  os << "stream " << s.id() << " on device " << s.device();
   return os;
 }
 
@@ -106,3 +105,7 @@ struct hash<c10::Stream> {
   }
 };
 }  // namespace std
+
+namespace at {
+using c10::Stream;
+}
diff --git a/paddle/phi/api/include/compat/c10/cuda/CUDAFunctions.cpp b/paddle/phi/api/include/compat/c10/cuda/CUDAFunctions.cpp
new file mode 100644
index 00000000000000..482bf7036d7cea
--- /dev/null
+++ b/paddle/phi/api/include/compat/c10/cuda/CUDAFunctions.cpp
@@ -0,0 +1,44 @@
+// Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <c10/cuda/CUDAFunctions.h>
+
+namespace c10::cuda {
+
+c10::DeviceIndex device_count() {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  return phi::backends::gpu::GetGPUDeviceCount();
+#else
+  // Return 0 instead of throwing to match PyTorch API semantics
+  // at::cuda::is_available() relies on this returning 0/false
+  return 0;
+#endif
+}
+
+void device_synchronize() {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  int curr_device_id = paddle::platform::GetCurrentDeviceId();
+  paddle::platform::SetDeviceId(curr_device_id);
+#ifdef PADDLE_WITH_HIP
+  PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize());
+#else
+  PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize());
+#endif
+#else
+  PADDLE_THROW(common::errors::Unavailable(
+      "Paddle is not compiled with CUDA. Cannot visit device synchronize."));
+#endif
+}
+
+}  // namespace c10::cuda
diff --git a/paddle/phi/api/include/compat/c10/cuda/CUDAFunctions.h b/paddle/phi/api/include/compat/c10/cuda/CUDAFunctions.h
index 086c3f486cc6ae..25a0301783b606 100644
--- a/paddle/phi/api/include/compat/c10/cuda/CUDAFunctions.h
+++ b/paddle/phi/api/include/compat/c10/cuda/CUDAFunctions.h
@@ -18,37 +18,21 @@
 
 namespace c10::cuda {
 
-inline c10::DeviceIndex device_count() {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  return phi::backends::gpu::GetGPUDeviceCount();
-#else
-  // Return 0 instead of throwing to match PyTorch API semantics
-  // at::cuda::is_available() relies on this returning 0/false
-  return 0;
-#endif
-}
+c10::DeviceIndex device_count();
 
-inline void device_synchronize() {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  int curr_device_id = paddle::platform::GetCurrentDeviceId();
-  paddle::platform::SetDeviceId(curr_device_id);
-#ifdef PADDLE_WITH_HIP
-  PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize());
-#else
-  PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize());
-#endif
-#else
-  PADDLE_THROW(common::errors::Unavailable(
-      "Paddle is not compiled with CUDA. Cannot visit device synchronize."));
-#endif
-}
+void device_synchronize();
 
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 void __inline__ stream_synchronize(gpuStream_t stream) {
   phi::backends::gpu::GpuStreamSync(stream);
 }
+#endif
+
 }  // namespace c10::cuda
 
 namespace at::cuda {
 using c10::cuda::device_synchronize;
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 using c10::cuda::stream_synchronize;
+#endif
 }  // namespace at::cuda
diff --git a/paddle/phi/api/include/compat/c10/cuda/CUDAGuard.h b/paddle/phi/api/include/compat/c10/cuda/CUDAGuard.h
index ce819e69e64932..8f2193ef46ae0e 100644
--- a/paddle/phi/api/include/compat/c10/cuda/CUDAGuard.h
+++ b/paddle/phi/api/include/compat/c10/cuda/CUDAGuard.h
@@ -19,16 +19,43 @@
 #pragma once
 
 #include <c10/core/Device.h>
+#include <c10/util/Exception.h>
+
 #include <optional>
+
 #include "paddle/phi/core/platform/cuda_device_guard.h"
 
 namespace c10::cuda {
+
+namespace detail {
+
+inline Device current_cuda_device() {
+  return Device(kCUDA, phi::backends::gpu::GetCurrentDeviceId());
+}
+
+inline Device normalize_cuda_device(Device device) {
+  TORCH_CHECK(device.is_cuda(), "Expected a CUDA device, but got ", device);
+  return Device(kCUDA, device.has_index() ? device.index() : 0);
+}
+
+}  // namespace detail
+
 struct CUDAGuard {
   explicit CUDAGuard() = delete;  // NOLINT
 
-  explicit CUDAGuard(DeviceIndex device_index) : guard_(device_index) {}
+  explicit CUDAGuard(DeviceIndex device_index)
+      : original_device_(detail::current_cuda_device()),
+        current_device_(original_device_),
+        guard_() {
+    set_index(device_index);
+  }
 
-  explicit CUDAGuard(Device device) : guard_(device._PD_GetInner()) {}
+  explicit CUDAGuard(Device device)
+      : original_device_(detail::current_cuda_device()),
+        current_device_(original_device_),
+        guard_() {
+    set_device(device);
+  }
 
   CUDAGuard(const CUDAGuard&) = delete;
   CUDAGuard& operator=(const CUDAGuard&) = delete;
@@ -37,79 +64,86 @@ struct CUDAGuard {
   CUDAGuard& operator=(CUDAGuard&& other) = delete;
   ~CUDAGuard() = default;
 
-  void set_device(Device device) { guard_.SetDevice(device._PD_GetInner()); }
+  void set_device(Device device) {
+    current_device_ = detail::normalize_cuda_device(device);
+    guard_.SetDevice(current_device_._PD_GetInner());
+  }
 
   void reset_device(Device device) { set_device(device); }
 
   void set_index(DeviceIndex device_index) {
+    current_device_ = Device(kCUDA, device_index);
     guard_.SetDeviceIndex(device_index);
   }
 
-  Device current_device() const {
-    return c10::Device(c10::kCUDA, phi::backends::gpu::GetCurrentDeviceId());
-  }
+  Device original_device() const { return original_device_; }
+
+  Device current_device() const { return current_device_; }
 
  private:
+  Device original_device_;
+  Device current_device_;
   paddle::platform::CUDADeviceGuard guard_;
 };
 
 struct OptionalCUDAGuard {
   OptionalCUDAGuard() = default;
 
-  explicit OptionalCUDAGuard(std::optional<Device> device_opt) : guard_() {
+  explicit OptionalCUDAGuard(std::optional<Device> device_opt) {
     if (device_opt.has_value()) {
-      guard_.emplace(device_opt.value()._PD_GetInner());
+      set_device(device_opt.value());
     }
   }
 
-  explicit OptionalCUDAGuard(std::optional<DeviceIndex> device_index_opt)
-      : guard_() {
+  explicit OptionalCUDAGuard(std::optional<DeviceIndex> device_index_opt) {
     if (device_index_opt.has_value()) {
-      guard_.emplace(device_index_opt.value());
+      set_index(device_index_opt.value());
     }
   }
 
-  // Copy is not allowed
   OptionalCUDAGuard(const OptionalCUDAGuard&) = delete;
   OptionalCUDAGuard& operator=(const OptionalCUDAGuard&) = delete;
 
   OptionalCUDAGuard(OptionalCUDAGuard&& other) = delete;
-
   OptionalCUDAGuard& operator=(OptionalCUDAGuard&& other) = delete;
   ~OptionalCUDAGuard() = default;
 
   void set_device(Device device) {
-    if (!guard_.has_value()) {
-      guard_.emplace(device._PD_GetInner());
-    } else {
-      guard_->SetDevice(device._PD_GetInner());
-    }
+    const Device normalized = detail::normalize_cuda_device(device);
+    init_if_needed();
+    guard_->SetDevice(normalized._PD_GetInner());
+    current_device_ = normalized;
   }
 
-  void reset_device(Device device) {
-    if (!guard_.has_value()) {
-      guard_.emplace(device._PD_GetInner());
-    } else {
-      guard_->SetDevice(device._PD_GetInner());
-    }
-  }
+  void reset_device(Device device) { set_device(device); }
 
   void set_index(DeviceIndex device_index) {
-    if (!guard_.has_value()) {
-      guard_.emplace(device_index);
-    } else {
-      guard_->SetDeviceIndex(device_index);
-    }
+    init_if_needed();
+    guard_->SetDeviceIndex(device_index);
+    current_device_ = Device(kCUDA, device_index);
   }
 
-  std::optional<Device> current_device() const {
-    return guard_.has_value()
-               ? std::make_optional(c10::Device(
-                     c10::kCUDA, phi::backends::gpu::GetCurrentDeviceId()))
-               : std::nullopt;
+  std::optional<Device> original_device() const { return original_device_; }
+
+  std::optional<Device> current_device() const { return current_device_; }
+
+  void reset() {
+    guard_.reset();
+    original_device_.reset();
+    current_device_.reset();
   }
 
  private:
+  void init_if_needed() {
+    if (!guard_.has_value()) {
+      original_device_ = detail::current_cuda_device();
+      current_device_ = original_device_;
+      guard_.emplace();
+    }
+  }
+
+  std::optional<Device> original_device_;
+  std::optional<Device> current_device_;
   std::optional<paddle::platform::CUDADeviceGuard> guard_;
 };
 
diff --git a/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h b/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h
index f88dd043317252..8f55bdef3f9a21 100644
--- a/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h
+++ b/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h
@@ -18,9 +18,14 @@
 #include <c10/core/Stream.h>
 #include <c10/cuda/CUDAException.h>
 
+#include <algorithm>
 #include <array>
 #include <atomic>
+#include <functional>
 #include <mutex>
+#include <ostream>
+#include <tuple>
+
 #include "paddle/phi/api/include/context_pool.h"
 #include "paddle/phi/backends/gpu/gpu_info.h"
 #include "paddle/phi/common/place.h"
@@ -30,16 +35,13 @@ namespace c10::cuda {
 
 using StreamId = int64_t;
 
-// ── Per-device stream pool and per-thread current stream ─────────────────────
+static constexpr int max_compile_time_stream_priorities = 4;
 
 namespace detail {
 
 constexpr int kStreamsPerPool = 32;
-// Upper bound for static pool/TLS arrays. 64 covers all current CUDA hardware.
 constexpr int kMaxDevices = 64;
 
-// Device count is invariant after CUDA initialization; cache it to avoid
-// repeated driver calls on the hot path.
 inline int gpu_device_count() {
   static const int count = phi::backends::gpu::GetGPUDeviceCount();
   return count;
@@ -74,7 +76,7 @@ inline StreamPoolState& get_pool(int device_index) {
 inline void init_pool(int device_index, StreamPoolState* state) {
   phi::backends::gpu::GPUDeviceGuard guard(device_index);
   int lo_pri = 0, hi_pri = 0;
-  cudaDeviceGetStreamPriorityRange(&lo_pri, &hi_pri);
+  C10_CUDA_CHECK(cudaDeviceGetStreamPriorityRange(&lo_pri, &hi_pri));
   for (int i = 0; i < kStreamsPerPool; ++i) {
     C10_CUDA_CHECK(cudaStreamCreateWithPriority(
         &state->low_priority[i], cudaStreamNonBlocking, lo_pri));
@@ -83,9 +85,6 @@ inline void init_pool(int device_index, StreamPoolState* state) {
   }
 }
 
-// Per-thread, per-device current stream state.
-// thread_local inside an inline function is ODR-safe across translation units
-// (C++11 §3.2): all TUs share the same thread-local instance per thread.
 struct TLSStreamState {
   cudaStream_t streams[kMaxDevices]{};
   bool has_stream[kMaxDevices]{};
@@ -98,22 +97,24 @@ inline TLSStreamState& get_tls() {
 
 }  // namespace detail
 
-// ── CUDAStream ───────────────────────────────────────────────────────────────
-
 class CUDAStream {
  public:
+  enum Unchecked { UNCHECKED };
+
   CUDAStream() = delete;
 
   explicit CUDAStream(Stream stream) : stream_(stream) {
     TORCH_CHECK(stream_.device_type() == DeviceType::CUDA);
   }
 
+  explicit CUDAStream(Unchecked /*unused*/, Stream stream) : stream_(stream) {}
+
   bool operator==(const CUDAStream& other) const noexcept {
-    return stream_ == other.stream_;
+    return unwrap() == other.unwrap();
   }
 
   bool operator!=(const CUDAStream& other) const noexcept {
-    return stream_ != other.stream_;
+    return unwrap() != other.unwrap();
   }
 
   StreamId id() const { return stream_.id(); }
@@ -122,6 +123,21 @@ class CUDAStream {
 
   operator Stream() const { return unwrap(); }
 
+  bool query() const { return unwrap().query(); }
+
+  void synchronize() const { unwrap().synchronize(); }
+
+  int priority() const {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+    phi::backends::gpu::GPUDeviceGuard guard(device_index());
+    int priority = 0;
+    C10_CUDA_CHECK(cudaStreamGetPriority(stream(), &priority));
+    return priority;
+#else
+    return 0;
+#endif
+  }
+
   cudaStream_t stream() const {
     return reinterpret_cast<cudaStream_t>(stream_.id());
   }
@@ -134,17 +150,34 @@ class CUDAStream {
 
   Device device() const { return Device(DeviceType::CUDA, device_index()); }
 
-  // TODO(youge325): Remove after DeepEP paddle branch is updated to use
-  // stream()
-  cudaStream_t raw_stream() const { return stream(); }
+  struct c10::StreamData3 pack3() const {
+    return stream_.pack3();
+  }
+
+  static CUDAStream unpack3(StreamId stream_id,
+                            DeviceIndex device_index,
+                            DeviceType device_type) {
+    return CUDAStream(Stream::unpack3(stream_id, device_index, device_type));
+  }
+
+  static std::tuple<int, int> priority_range() {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+    int least_priority = 0;
+    int greatest_priority = 0;
+    C10_CUDA_CHECK(
+        cudaDeviceGetStreamPriorityRange(&least_priority, &greatest_priority));
+    greatest_priority =
+        std::max(-max_compile_time_stream_priorities + 1, greatest_priority);
+    return std::make_tuple(least_priority, greatest_priority);
+#else
+    return std::make_tuple(0, 0);
+#endif
+  }
 
  private:
   Stream stream_;
 };
 
-// Build a CUDAStream from a raw platform stream handle and a device index.
-// The handle is encoded as a StreamId via reinterpret_cast, matching Paddle's
-// phi::Stream / phi::CUDAStream convention.
 inline CUDAStream make_cuda_stream(cudaStream_t raw,
                                    c10::DeviceIndex device_index) {
   c10::StreamId sid =
@@ -155,14 +188,6 @@ inline CUDAStream make_cuda_stream(cudaStream_t raw,
                   sid));
 }
 
-/**
- * Get the current CUDA stream for the given device (or the current device if
- * device_index == -1).
- *
- * Returns the per-thread current stream if one has been set via
- * setCurrentCUDAStream() for this thread and device; otherwise falls back to
- * Paddle's default stream for the device.
- */
 inline CUDAStream getCurrentCUDAStream(c10::DeviceIndex device_index = -1) {
   if (device_index == -1) {
     device_index = phi::backends::gpu::GetCurrentDeviceId();
@@ -181,29 +206,21 @@ inline CUDAStream getCurrentCUDAStream(c10::DeviceIndex device_index = -1) {
   return make_cuda_stream(raw, device_index);
 }
 
-/**
- * Get a stream from the per-device pool in round-robin fashion.
- * Returns a high priority stream if isHighPriority is true.
- *
- * The pool is lazily initialized on first use for each device.  Each device
- * has kStreamsPerPool low-priority and kStreamsPerPool high-priority streams
- * that are reused round-robin.  Pool streams are always distinct from the
- * current stream, enabling cross-stream dependency management and correct
- * record_stream lifetime semantics.
- */
 inline CUDAStream getStreamFromPool(const bool isHighPriority = false,
+                                    c10::DeviceIndex device_index = -1);
+
+inline CUDAStream getStreamFromPool(const int priority,
                                     c10::DeviceIndex device_index = -1) {
   if (device_index == -1) {
     device_index = phi::backends::gpu::GetCurrentDeviceId();
   }
-  // get_pool also performs bounds-checking on device_index.
   auto& state = detail::get_pool(device_index);
   std::call_once(state.init_flag, [device_index, &state]() {
     detail::init_pool(device_index, &state);
   });
 
   cudaStream_t raw;
-  if (isHighPriority) {
+  if (priority < 0) {
     raw = state.high_priority[state.hp_counter.fetch_add(1) %
                               detail::kStreamsPerPool];
   } else {
@@ -213,14 +230,17 @@ inline CUDAStream getStreamFromPool(const bool isHighPriority = false,
   return make_cuda_stream(raw, device_index);
 }
 
-/**
- * Set the current CUDA stream for the device of the given stream in the
- * calling thread.
- *
- * Implements per-thread, per-device current stream semantics: the change is
- * local to the calling OS thread and does not affect any shared state such as
- * Paddle's GPUContext.  Other threads continue to see their own current stream.
- */
+inline CUDAStream getStreamFromPool(const bool isHighPriority,
+                                    c10::DeviceIndex device_index) {
+  return getStreamFromPool(isHighPriority ? -1 : 0, device_index);
+}
+
+inline CUDAStream getStreamFromExternal(cudaStream_t ext_stream,
+                                        c10::DeviceIndex device_index) {
+  detail::check_device_index(device_index);
+  return make_cuda_stream(ext_stream, device_index);
+}
+
 inline void setCurrentCUDAStream(CUDAStream stream) {
   c10::DeviceIndex idx = stream.unwrap().device_index();
   detail::check_device_index(idx);
@@ -234,22 +254,30 @@ inline CUDAStream getDefaultCUDAStream(c10::DeviceIndex device_index = -1) {
     device_index = phi::backends::gpu::GetCurrentDeviceId();
   }
   detail::check_device_index(device_index);
-  // The default CUDA stream is always the null stream (cudaStreamDefault,
-  // handle == 0), regardless of any per-thread current stream override.
-  // This matches PyTorch semantics where getDefaultCUDAStream() returns the
-  // fixed device-level default stream, while getCurrentCUDAStream() returns
-  // the per-thread current stream (which may differ after
-  // setCurrentCUDAStream).
   return CUDAStream(c10::Stream(
       c10::Stream::DEFAULT, c10::Device(c10::DeviceType::CUDA, device_index)));
 }
 
+inline std::ostream& operator<<(std::ostream& stream, const CUDAStream& s) {
+  return stream << s.unwrap();
+}
+
 }  // namespace c10::cuda
 
+namespace std {
+template <>
+struct hash<c10::cuda::CUDAStream> {
+  size_t operator()(c10::cuda::CUDAStream s) const noexcept {
+    return std::hash<c10::Stream>{}(s.unwrap());
+  }
+};
+}  // namespace std
+
 namespace at::cuda {
 using c10::cuda::CUDAStream;
 using c10::cuda::getCurrentCUDAStream;
 using c10::cuda::getDefaultCUDAStream;
+using c10::cuda::getStreamFromExternal;
 using c10::cuda::getStreamFromPool;
 using c10::cuda::setCurrentCUDAStream;
 }  // namespace at::cuda
diff --git a/paddle/phi/api/include/compat/c10/util/Exception.h b/paddle/phi/api/include/compat/c10/util/Exception.h
index 70c8f656417296..2651c83646044c 100644
--- a/paddle/phi/api/include/compat/c10/util/Exception.h
+++ b/paddle/phi/api/include/compat/c10/util/Exception.h
@@ -34,16 +34,16 @@
 namespace c10 {
 #define TORCH_CHECK(COND, ...) PD_CHECK(COND, ##__VA_ARGS__);
 #define TORCH_INTERNAL_ASSERT(COND, ...) PD_CHECK(COND, ##__VA_ARGS__);
-#define TORCH_CHECK_OP(val1, val2, op)                                    \
-  do {                                                                    \
-    auto&& _val1 = (val1);                                                \
-    auto&& _val2 = (val2);                                                \
-    if (!(_val1 op _val2)) {                                              \
-      std::ostringstream _result;                                         \
-      _result << "Expected " #val1 " " #op " " #val2 " (" << _val1 << " " \
-              << #op << " " << _val2 << "), but got false";               \
-      PD_THROW(_result.str());                                            \
-    }                                                                     \
+#define TORCH_CHECK_OP(val1, val2, op)                                  \
+  do {                                                                  \
+    auto&& _val1 = (val1);                                              \
+    auto&& _val2 = (val2);                                              \
+    if (!(_val1 op _val2)) {                                            \
+      std::ostringstream _result;                                       \
+      _result << "Check failed: " #val1 " " #op " " #val2 " (" << _val1 \
+              << " vs. " << _val2 << "). ";                             \
+      PD_THROW(_result.str());                                          \
+    }                                                                   \
   } while (false);
 
 // Check for a given boolean condition.
diff --git a/paddle/phi/api/include/compat/torch/csrc/api/include/torch/cuda.h b/paddle/phi/api/include/compat/torch/csrc/api/include/torch/cuda.h
index 3cf18fd4f22574..4eb38ceecc681f 100644
--- a/paddle/phi/api/include/compat/torch/csrc/api/include/torch/cuda.h
+++ b/paddle/phi/api/include/compat/torch/csrc/api/include/torch/cuda.h
@@ -28,7 +28,5 @@ void synchronize(int64_t device_index = -1);
 
 }  // namespace torch::cuda
 namespace at::cuda {
-using torch::cuda::device_count;
-using torch::cuda::is_available;
 using torch::cuda::synchronize;
 }  // namespace at::cuda
diff --git a/paddle/phi/api/include/compat/torch/csrc/api/include/torch/version.h b/paddle/phi/api/include/compat/torch/csrc/api/include/torch/version.h
new file mode 100644
index 00000000000000..f745837944df17
--- /dev/null
+++ b/paddle/phi/api/include/compat/torch/csrc/api/include/torch/version.h
@@ -0,0 +1,39 @@
+// Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+/// Indicates the major version of LibTorch.
+#define TORCH_VERSION_MAJOR 2
+
+/// Indicates the minor version of LibTorch.
+#define TORCH_VERSION_MINOR 10
+
+/// Indicates the patch version of LibTorch.
+#define TORCH_VERSION_PATCH 0
+
+/// Indicates the ABI version tag of LibTorch.
+#define TORCH_VERSION_ABI_TAG 0
+
+/// Indicates the version of LibTorch as a string literal.
+#define TORCH_VERSION "2.10.0"
+
+/// Indicates the ABI version of LibTorch as a single uint64.
+/// [ byte ][ byte ][ byte ][ byte ][ byte ][ byte ][ byte ][ byte ]
+/// [ MAJ  ][ MIN  ][ PATCH][                              ABI TAG ]
+#define TORCH_ABI_VERSION                                                 \
+  (((0ULL + TORCH_VERSION_MAJOR) << 56) |                                 \
+   ((0ULL + TORCH_VERSION_MINOR) << 48) | /* NOLINT(whitespace/indent) */ \
+   ((0ULL + TORCH_VERSION_PATCH) << 40) | /* NOLINT(whitespace/indent) */ \
+   ((0ULL + TORCH_VERSION_ABI_TAG) << 0)) /* NOLINT(whitespace/indent) */
diff --git a/test/cpp/compat/ATen_TensorAccessor_test.cc b/test/cpp/compat/ATen_TensorAccessor_test.cc
index cb1eaaf3c8add0..4d6b8e9648bcfa 100644
--- a/test/cpp/compat/ATen_TensorAccessor_test.cc
+++ b/test/cpp/compat/ATen_TensorAccessor_test.cc
@@ -198,7 +198,7 @@ TEST(TensorAccessorTest, PackedAccessorWithIntType) {
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 TEST(TensorAccessorTest, PackedAccessorCUDA) {
-  if (at::cuda::is_available()) {
+  if (torch::cuda::is_available()) {
     // Create CUDA tensor
     at::Tensor tensor =
         at::arange(12, at::TensorOptions().dtype(at::kFloat).device(at::kCUDA))
diff --git a/test/cpp/compat/ATen_factory_default_dtype_test.cc b/test/cpp/compat/ATen_factory_default_dtype_test.cc
index 70dd3ab02cdeeb..08cc8d4c2d9a3d 100644
--- a/test/cpp/compat/ATen_factory_default_dtype_test.cc
+++ b/test/cpp/compat/ATen_factory_default_dtype_test.cc
@@ -51,19 +51,59 @@ TEST(ATenFactoryDefaultDtypeTest, EmptyNulloptDtypeUsesCurrentDefault) {
   ASSERT_EQ(tensor.sizes(), c10::IntArrayRef({2, 3}));
 }
 
-TEST(ATenFactoryDefaultDtypeTest, ArangeNulloptDtypeUsesCurrentDefault) {
+TEST(ATenFactoryDefaultDtypeTest, ArangeOmittedDtypeUsesLongForIntegralInputs) {
   DefaultDtypeGuard guard(at::kDouble);
 
-  at::Tensor end_only =
+  at::Tensor end_only_default = at::arange(5);
+  at::Tensor start_end_default = at::arange(1, 6);
+  at::Tensor start_end_step_default = at::arange(1, 7, 2);
+  at::Tensor end_only_nullopt =
       at::arange(5, std::nullopt, std::nullopt, at::kCPU, false);
-  at::Tensor start_end =
+  at::Tensor start_end_nullopt =
       at::arange(1, 6, std::nullopt, std::nullopt, at::kCPU, false);
-  at::Tensor start_end_step =
+  at::Tensor start_end_step_nullopt =
       at::arange(1, 7, 2, std::nullopt, std::nullopt, at::kCPU, false);
 
-  ASSERT_EQ(end_only.scalar_type(), at::kDouble);
-  ASSERT_EQ(start_end.scalar_type(), at::kDouble);
-  ASSERT_EQ(start_end_step.scalar_type(), at::kDouble);
+  ASSERT_EQ(end_only_default.scalar_type(), at::kLong);
+  ASSERT_EQ(start_end_default.scalar_type(), at::kLong);
+  ASSERT_EQ(start_end_step_default.scalar_type(), at::kLong);
+  ASSERT_EQ(end_only_nullopt.scalar_type(), at::kLong);
+  ASSERT_EQ(start_end_nullopt.scalar_type(), at::kLong);
+  ASSERT_EQ(start_end_step_nullopt.scalar_type(), at::kLong);
+  ASSERT_EQ(end_only_default.data_ptr<int64_t>()[4], 4);
+  ASSERT_EQ(start_end_default.data_ptr<int64_t>()[0], 1);
+  ASSERT_EQ(start_end_step_default.data_ptr<int64_t>()[2], 5);
+  ASSERT_EQ(end_only_nullopt.data_ptr<int64_t>()[4], 4);
+  ASSERT_EQ(start_end_nullopt.data_ptr<int64_t>()[0], 1);
+  ASSERT_EQ(start_end_step_nullopt.data_ptr<int64_t>()[2], 5);
+}
+
+TEST(ATenFactoryDefaultDtypeTest,
+     ArangeOmittedDtypeUsesCurrentDefaultForFloatingInputs) {
+  DefaultDtypeGuard guard(at::kDouble);
+
+  at::Tensor end_only_default = at::arange(5.0);
+  at::Tensor start_end_default = at::arange(1.0, 6.0);
+  at::Tensor start_end_step_default = at::arange(1.0, 7.0, 2.0);
+  at::Tensor end_only_nullopt =
+      at::arange(5.0, std::nullopt, std::nullopt, at::kCPU, false);
+  at::Tensor start_end_nullopt =
+      at::arange(1.0, 6.0, std::nullopt, std::nullopt, at::kCPU, false);
+  at::Tensor start_end_step_nullopt =
+      at::arange(1.0, 7.0, 2.0, std::nullopt, std::nullopt, at::kCPU, false);
+
+  ASSERT_EQ(end_only_default.scalar_type(), at::kDouble);
+  ASSERT_EQ(start_end_default.scalar_type(), at::kDouble);
+  ASSERT_EQ(start_end_step_default.scalar_type(), at::kDouble);
+  ASSERT_EQ(end_only_nullopt.scalar_type(), at::kDouble);
+  ASSERT_EQ(start_end_nullopt.scalar_type(), at::kDouble);
+  ASSERT_EQ(start_end_step_nullopt.scalar_type(), at::kDouble);
+  ASSERT_DOUBLE_EQ(end_only_default.data_ptr<double>()[4], 4.0);
+  ASSERT_DOUBLE_EQ(start_end_default.data_ptr<double>()[0], 1.0);
+  ASSERT_DOUBLE_EQ(start_end_step_default.data_ptr<double>()[2], 5.0);
+  ASSERT_DOUBLE_EQ(end_only_nullopt.data_ptr<double>()[4], 4.0);
+  ASSERT_DOUBLE_EQ(start_end_nullopt.data_ptr<double>()[0], 1.0);
+  ASSERT_DOUBLE_EQ(start_end_step_nullopt.data_ptr<double>()[2], 5.0);
 }
 
 TEST(ATenFactoryDefaultDtypeTest, FullNulloptDtypeUsesCurrentDefault) {
diff --git a/test/cpp/compat/ATen_pin_memory_creation_test.cc b/test/cpp/compat/ATen_pin_memory_creation_test.cc
index 4ead12a484f79f..8d89d00b07b083 100644
--- a/test/cpp/compat/ATen_pin_memory_creation_test.cc
+++ b/test/cpp/compat/ATen_pin_memory_creation_test.cc
@@ -123,8 +123,16 @@ TEST(ATenPinMemoryCreationTest, EyePinMemoryWithCUDADeviceErrors) {
                std::exception);
 }
 
-TEST(ATenPinMemoryCreationTest, ArangePinMemory) {
+TEST(ATenPinMemoryCreationTest, ArangePinMemoryOverloads) {
   SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
+  auto end_only_by_options =
+      at::arange(10, at::TensorOptions().dtype(at::kFloat).pinned_memory(true));
+  AssertPinned(end_only_by_options);
+
+  auto end_only_by_args =
+      at::arange(10, at::kFloat, std::nullopt, at::kCPU, true);
+  AssertPinned(end_only_by_args);
+
   auto by_options = at::arange(
       0, 10, at::TensorOptions().dtype(at::kFloat).pinned_memory(true));
   AssertPinned(by_options);
@@ -132,14 +140,30 @@ TEST(ATenPinMemoryCreationTest, ArangePinMemory) {
   auto by_args = at::arange(0, 10, at::kFloat, std::nullopt, at::kCPU, true);
   AssertPinned(by_args);
 
+  auto step_by_options = at::arange(
+      0, 10, 2, at::TensorOptions().dtype(at::kFloat).pinned_memory(true));
+  AssertPinned(step_by_options);
+
+  auto step_by_args =
+      at::arange(0, 10, 2, at::kFloat, std::nullopt, at::kCPU, true);
+  AssertPinned(step_by_args);
+
   auto no_pin = at::arange(0, 10, at::kFloat, std::nullopt, at::kCPU, false);
   AssertNotPinned(no_pin);
+
+  auto step_no_pin =
+      at::arange(0, 10, 2, at::kFloat, std::nullopt, at::kCPU, false);
+  AssertNotPinned(step_no_pin);
 }
 
 TEST(ATenPinMemoryCreationTest, ArangePinMemoryWithCUDADeviceErrors) {
   SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
+  ASSERT_THROW(at::arange(10, at::kFloat, std::nullopt, at::kCUDA, true),
+               std::exception);
   ASSERT_THROW(at::arange(0, 10, at::kFloat, std::nullopt, at::kCUDA, true),
                std::exception);
+  ASSERT_THROW(at::arange(0, 10, 2, at::kFloat, std::nullopt, at::kCUDA, true),
+               std::exception);
 }
 
 TEST(ATenPinMemoryCreationTest, EmptyLikePinMemory) {
diff --git a/test/cpp/compat/ATen_resize_test.cc b/test/cpp/compat/ATen_resize_test.cc
index 0ad6b09f58eb2b..e39fc40c58a176 100644
--- a/test/cpp/compat/ATen_resize_test.cc
+++ b/test/cpp/compat/ATen_resize_test.cc
@@ -24,8 +24,8 @@
 #include "torch/all.h"
 
 // ======================== resize_ tests ========================
-// Note: Paddle's resize_ is implemented via reshape, which requires
-// total element count to remain unchanged.
+// Note: compat resize_ uses reshape when numel is unchanged, and falls back to
+// set_ for storage-changing cases so repeated resize_ calls remain stable.
 
 TEST(TensorResizeTest, ResizeBasic) {
   // Create a 2x3 tensor
@@ -109,6 +109,34 @@ TEST(TensorResizeTest, ResizePreservesData) {
   ASSERT_FLOAT_EQ(data[5], 5.0f);
 }
 
+TEST(TensorResizeTest, ResizeShrinkDifferentNumel) {
+  at::Tensor t = at::arange(24, at::kFloat).reshape({2, 3, 4});
+
+  t.resize_({4, 5});
+
+  ASSERT_EQ(t.sizes()[0], 4);
+  ASSERT_EQ(t.sizes()[1], 5);
+
+  float* data = t.data_ptr<float>();
+  for (int i = 0; i < 20; ++i) {
+    ASSERT_FLOAT_EQ(data[i], static_cast<float>(i));
+  }
+}
+
+TEST(TensorResizeTest, ResizeGrowDifferentNumelPreservesPrefix) {
+  at::Tensor t = at::arange(6, at::kFloat).reshape({2, 3});
+
+  t.resize_({2, 5});
+
+  ASSERT_EQ(t.sizes()[0], 2);
+  ASSERT_EQ(t.sizes()[1], 5);
+
+  float* data = t.data_ptr<float>();
+  for (int i = 0; i < 6; ++i) {
+    ASSERT_FLOAT_EQ(data[i], static_cast<float>(i));
+  }
+}
+
 TEST(TensorResizeTest, ResizeReturnReference) {
   // Create a tensor
   at::Tensor t = at::zeros({2, 3});
diff --git a/test/cpp/compat/c10_Device_test.cc b/test/cpp/compat/c10_Device_test.cc
index 08331a68162194..04dc88b4d0e2e6 100644
--- a/test/cpp/compat/c10_Device_test.cc
+++ b/test/cpp/compat/c10_Device_test.cc
@@ -16,6 +16,7 @@
 #include <c10/core/DeviceType.h>
 
 #include <sstream>
+#include <unordered_map>
 
 #include "gtest/gtest.h"
 
@@ -68,6 +69,9 @@ TEST(DeviceTypeCompatTest, DeviceTypeConversionAndStreamOperator) {
   std::ostringstream invalid_os;
   invalid_os << static_cast<c10::DeviceType>(99);
   EXPECT_TRUE(invalid_os.str().empty());
+
+  EXPECT_EQ(c10::DeviceType::PrivateUse1, c10::DeviceType::CUSTOM);
+  EXPECT_EQ(c10::kPrivateUse1, c10::DeviceType::PrivateUse1);
 }
 
 TEST(DeviceCompatTest, DeviceParseAndPlaceBranches) {
@@ -122,3 +126,47 @@ TEST(DeviceCompatTest, DeviceParseAndPlaceBranches) {
   os << cuda;
   EXPECT_EQ(os.str(), "cuda:3");
 }
+
+TEST(DeviceCompatTest, DeviceInterfaceParity) {
+  c10::Device cpu(c10::kCPU);
+  c10::Device cuda(c10::kCUDA, 0);
+  c10::Device xpu(c10::kXPU, 1);
+  c10::Device ipu(c10::kIPU, 2);
+  c10::Device privateuse(c10::kPrivateUse1, 4);
+
+  EXPECT_TRUE(cpu.is_cpu());
+  EXPECT_TRUE(cuda.is_cuda());
+  EXPECT_TRUE(xpu.is_xpu());
+  EXPECT_TRUE(ipu.is_ipu());
+  EXPECT_TRUE(privateuse.is_privateuseone());
+  EXPECT_FALSE(privateuse.is_mps());
+  EXPECT_FALSE(privateuse.is_hip());
+  EXPECT_FALSE(privateuse.is_meta());
+  EXPECT_TRUE(cpu.supports_as_strided());
+  EXPECT_FALSE(ipu.supports_as_strided());
+
+  c10::Device cpu_with_index(c10::kCPU);
+  cpu_with_index.set_index(0);
+  EXPECT_EQ(cpu_with_index.index(), 0);
+  EXPECT_EQ(cpu_with_index.str(), "cpu:0");
+
+  c10::Device cuda_with_index(c10::kCUDA);
+  cuda_with_index.set_index(2);
+  EXPECT_EQ(cuda_with_index.index(), 2);
+  EXPECT_EQ(cuda_with_index.str(), "cuda:2");
+
+  EXPECT_NE(cpu, cuda);
+  EXPECT_EQ(cuda, c10::Device(c10::kCUDA, 0));
+  EXPECT_EQ(privateuse.str(), "privateuseone:4");
+  EXPECT_TRUE(c10::Device("privateuseone:7").is_privateuseone());
+
+  EXPECT_THROW(c10::Device("cuda:-1"), ::std::exception);
+  EXPECT_THROW(c10::Device("cuda:01"), ::std::exception);
+  EXPECT_THROW(c10::Device("cuda:1:2"), ::std::exception);
+
+  std::unordered_map<c10::Device, int> device_map;
+  device_map.emplace(c10::Device(c10::kCUDA, 0), 7);
+  device_map.emplace(c10::Device(c10::kCPU), 3);
+  EXPECT_EQ(device_map.at(c10::Device(c10::kCUDA, 0)), 7);
+  EXPECT_EQ(device_map.at(c10::Device(c10::kCPU)), 3);
+}
diff --git a/test/cpp/compat/c10_ScalarType_test.cc b/test/cpp/compat/c10_ScalarType_test.cc
index ea0895412f72df..a373ea81841047 100644
--- a/test/cpp/compat/c10_ScalarType_test.cc
+++ b/test/cpp/compat/c10_ScalarType_test.cc
@@ -25,7 +25,6 @@
 #include <c10/cuda/CUDAFunctions.h>
 #include <c10/cuda/CUDAGuard.h>
 #endif
-#include <sstream>
 #include "ATen/ATen.h"
 #include "gtest/gtest.h"
 #include "paddle/phi/common/float16.h"
@@ -91,78 +90,3 @@ TEST(TensorBaseTest, TypeCheckingAPIs) {
   ASSERT_FALSE(uint8_tensor.is_signed());
   ASSERT_FALSE(bool_tensor.is_signed());
 }
-
-TEST(ScalarTypeCompatTest, ScalarTypeUtilityBranches) {
-  EXPECT_STREQ(c10::toString(c10::ScalarType::Bits1x8), "Bits1x8");
-  EXPECT_STREQ(c10::toString(c10::ScalarType::Bits16), "Bits16");
-  EXPECT_STREQ(c10::toString(c10::ScalarType::Float8_e5m2fnuz),
-               "Float8_e5m2fnuz");
-  EXPECT_STREQ(c10::toString(c10::ScalarType::Float8_e4m3fnuz),
-               "Float8_e4m3fnuz");
-  EXPECT_STREQ(c10::toString(c10::ScalarType::Float8_e8m0fnu),
-               "Float8_e8m0fnu");
-  EXPECT_STREQ(c10::toString(c10::ScalarType::Float4_e2m1fn_x2),
-               "Float4_e2m1fn_x2");
-  EXPECT_STREQ(c10::toString(c10::ScalarType::Undefined), "Undefined");
-  EXPECT_STREQ(c10::toString(static_cast<c10::ScalarType>(-1)),
-               "UNKNOWN_SCALAR");
-
-  EXPECT_EQ(c10::elementSize(c10::ScalarType::QInt8), static_cast<size_t>(1));
-  EXPECT_EQ(c10::elementSize(c10::ScalarType::QUInt4x2),
-            static_cast<size_t>(1));
-  EXPECT_EQ(c10::elementSize(c10::ScalarType::QInt32), static_cast<size_t>(4));
-  EXPECT_EQ(c10::elementSize(c10::ScalarType::Bits16), static_cast<size_t>(4));
-  EXPECT_THROW(c10::elementSize(c10::ScalarType::Undefined), ::std::exception);
-
-  EXPECT_TRUE(c10::isIntegralType(c10::ScalarType::Bool, true));
-  EXPECT_FALSE(c10::isIntegralType(c10::ScalarType::Bool, false));
-  EXPECT_TRUE(c10::isFloat8Type(c10::ScalarType::Float8_e5m2));
-  EXPECT_FALSE(c10::isFloat8Type(c10::ScalarType::Float8_e4m3fnuz));
-  EXPECT_TRUE(c10::isReducedFloatingType(c10::ScalarType::BFloat16));
-  EXPECT_TRUE(c10::isFloatingType(c10::ScalarType::Float));
-  EXPECT_FALSE(c10::isComplexType(c10::ScalarType::ComplexHalf));
-
-  EXPECT_TRUE(c10::isSignedType(c10::ScalarType::Int1));
-  EXPECT_FALSE(c10::isSignedType(c10::ScalarType::UInt3));
-  EXPECT_FALSE(c10::isSignedType(c10::ScalarType::QUInt8));
-  EXPECT_TRUE(c10::isSignedType(c10::ScalarType::Float8_e5m2fnuz));
-  EXPECT_THROW(c10::isSignedType(c10::ScalarType::Undefined), ::std::exception);
-
-  std::ostringstream oss;
-  oss << c10::ScalarType::UInt7;
-  EXPECT_EQ(oss.str(), "UInt7");
-}
-
-TEST(ScalarTypeCompatTest, AdditionalEnumAndPredicateBranches) {
-  EXPECT_STREQ(c10::toString(c10::ScalarType::QInt8), "QInt8");
-  EXPECT_STREQ(c10::toString(c10::ScalarType::QUInt8), "QUInt8");
-  EXPECT_STREQ(c10::toString(c10::ScalarType::QInt32), "QInt32");
-  EXPECT_STREQ(c10::toString(c10::ScalarType::QUInt4x2), "QUInt4x2");
-  EXPECT_STREQ(c10::toString(c10::ScalarType::QUInt2x4), "QUInt2x4");
-  EXPECT_STREQ(c10::toString(c10::ScalarType::ComplexHalf), "ComplexHalf");
-  EXPECT_STREQ(c10::toString(c10::ScalarType::Bits2x4), "Bits2x4");
-  EXPECT_STREQ(c10::toString(c10::ScalarType::Bits4x2), "Bits4x2");
-  EXPECT_STREQ(c10::toString(c10::ScalarType::Bits8), "Bits8");
-
-  EXPECT_EQ(c10::elementSize(c10::ScalarType::QUInt8), static_cast<size_t>(1));
-  EXPECT_EQ(c10::elementSize(c10::ScalarType::QUInt2x4),
-            static_cast<size_t>(1));
-  EXPECT_EQ(c10::elementSize(c10::ScalarType::Bits2x4), static_cast<size_t>(1));
-  EXPECT_EQ(c10::elementSize(c10::ScalarType::Bits4x2), static_cast<size_t>(1));
-  EXPECT_EQ(c10::elementSize(c10::ScalarType::Bits8), static_cast<size_t>(1));
-
-  EXPECT_TRUE(c10::isIntegralType(c10::ScalarType::UInt64, false));
-  EXPECT_FALSE(c10::isIntegralType(c10::ScalarType::Float, true));
-  EXPECT_TRUE(c10::isFloat8Type(c10::ScalarType::Float8_e4m3fn));
-  EXPECT_TRUE(c10::isReducedFloatingType(c10::ScalarType::Half));
-  EXPECT_FALSE(c10::isReducedFloatingType(c10::ScalarType::Float));
-  EXPECT_TRUE(c10::isFloatingType(c10::ScalarType::Half));
-  EXPECT_TRUE(c10::isComplexType(c10::ScalarType::ComplexFloat));
-
-  EXPECT_TRUE(c10::isSignedType(c10::ScalarType::QInt8));
-  EXPECT_TRUE(c10::isSignedType(c10::ScalarType::ComplexHalf));
-  EXPECT_FALSE(c10::isSignedType(c10::ScalarType::Byte));
-  EXPECT_FALSE(c10::isSignedType(c10::ScalarType::Bool));
-  EXPECT_THROW(c10::isSignedType(c10::ScalarType::NumOptions),
-               ::std::exception);
-}
diff --git a/test/cpp/compat/c10_layout_test.cc b/test/cpp/compat/c10_layout_test.cc
index 6e163aa8ee4c5a..1a0babeb7d3673 100644
--- a/test/cpp/compat/c10_layout_test.cc
+++ b/test/cpp/compat/c10_layout_test.cc
@@ -368,6 +368,9 @@ TEST(SparseConstructorTest, SparseCooTensorInferSize) {
 
   ASSERT_TRUE(sparse.is_sparse());
   ASSERT_EQ(sparse.layout(), c10::kSparse);
+  ASSERT_EQ(sparse.dim(), 2);
+  ASSERT_EQ(sparse.size(0), 3);
+  ASSERT_EQ(sparse.size(1), 3);
 }
 
 TEST(SparseConstructorTest, SparseCooTensorDouble) {
diff --git a/test/cpp/compat/c10_storage_test.cc b/test/cpp/compat/c10_storage_test.cc
index f05083fe88747f..947786fb929870 100644
--- a/test/cpp/compat/c10_storage_test.cc
+++ b/test/cpp/compat/c10_storage_test.cc
@@ -26,8 +26,9 @@
 #include "paddle/phi/backends/gpu/gpu_info.h"
 
 // Forward-declare getCUDADeviceAllocator to avoid include-order conflicts
-// between ATen/cuda/CUDAContextLight.h (defines at::cuda::is_available inline)
-// and torch/cuda.h (adds `using torch::cuda::is_available` to at::cuda).
+// between ATen/cuda/CUDAContextLight.h (defines torch::cuda::is_available
+// inline) and torch/cuda.h (adds `using torch::cuda::is_available` to
+// at::cuda).
 namespace at::cuda {
 c10::Allocator* getCUDADeviceAllocator();
 }  // namespace at::cuda
@@ -242,7 +243,7 @@ TEST(StorageTest, DeviceAndDeviceTypeAPIs) {
   ASSERT_EQ(place.GetType(), phi::AllocationType::CPU);
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  if (at::cuda::is_available()) {
+  if (torch::cuda::is_available()) {
     at::TensorBase cuda_tensor = at::ones(
         {2, 3}, c10::TensorOptions().dtype(at::kFloat).device(at::kCUDA));
     const c10::Storage& cuda_storage = cuda_tensor.storage();
@@ -731,9 +732,13 @@ TEST(StorageTest, DataPtrHelpersAndAllocatorSimpleDataPtrChecks) {
   dp.unsafe_set_device(c10::Device(c10::DeviceType::CPU));
   ASSERT_EQ(dp.device().type(), c10::DeviceType::CPU);
 
-  // is_simple_data_ptr: context==nullptr branch.
+  // PyTorch only treats context==data as a simple DataPtr.
   RawCompatibleAllocator compatible_alloc;
-  ASSERT_TRUE(compatible_alloc.is_simple_data_ptr(dp));
+  ASSERT_FALSE(compatible_alloc.is_simple_data_ptr(dp));
+
+  // is_simple_data_ptr: context==data branch.
+  c10::DataPtr simple = compatible_alloc.allocate(4);
+  ASSERT_TRUE(compatible_alloc.is_simple_data_ptr(simple));
 
   // is_simple_data_ptr: context!=data branch.
   c10::DataPtr non_simple = RawIncompatibleAllocator().allocate(4);
@@ -1041,7 +1046,7 @@ TEST(StorageTest, ReferenceSemanticsSetNbytesVisibleThroughCopy) {
 TEST(StorageTest, CUDAAllocatorZeroBytePreservesDevice) {
   // getCUDADeviceAllocator()->allocate(0) must return a DataPtr whose device
   // is the current CUDA device, not a default-constructed CPU DataPtr.
-  if (!at::cuda::is_available()) {
+  if (!torch::cuda::is_available()) {
     return;  // No CUDA device, skip
   }
 
diff --git a/test/cpp/compat/compat_basic_test.cc b/test/cpp/compat/compat_basic_test.cc
index d6f33869449003..232a9fd66e8f7c 100644
--- a/test/cpp/compat/compat_basic_test.cc
+++ b/test/cpp/compat/compat_basic_test.cc
@@ -231,7 +231,7 @@ TEST(compat_basic_test, BasicCase) {
 TEST(TestDevice, DeviceAPIsOnCUDA) {
   // Test device related APIs on CUDA if available
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  if (at::cuda::is_available()) {
+  if (torch::cuda::is_available()) {
     at::TensorBase cuda_tensor = at::ones(
         {2, 3}, c10::TensorOptions().dtype(at::kFloat).device(at::kCUDA));