From 2f67c076cecd391fa7e758a4032e2e2ed7e08ec4 Mon Sep 17 00:00:00 2001
From: youge325 <A372707325@126.com>
Date: Tue, 24 Mar 2026 22:26:10 +0800
Subject: [PATCH 1/8] [Cpp API Compatibility] remove CUDAStream::raw_stream()

---
 .../api/include/compat/c10/cuda/CUDAStream.h   | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h b/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h
index f88dd043317252..a6c1e95b61ef0f 100644
--- a/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h
+++ b/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h
@@ -74,7 +74,7 @@ inline StreamPoolState& get_pool(int device_index) {
 inline void init_pool(int device_index, StreamPoolState* state) {
   phi::backends::gpu::GPUDeviceGuard guard(device_index);
   int lo_pri = 0, hi_pri = 0;
-  cudaDeviceGetStreamPriorityRange(&lo_pri, &hi_pri);
+  C10_CUDA_CHECK(cudaDeviceGetStreamPriorityRange(&lo_pri, &hi_pri));
   for (int i = 0; i < kStreamsPerPool; ++i) {
     C10_CUDA_CHECK(cudaStreamCreateWithPriority(
         &state->low_priority[i], cudaStreamNonBlocking, lo_pri));
@@ -134,10 +134,6 @@ class CUDAStream {
 
   Device device() const { return Device(DeviceType::CUDA, device_index()); }
 
-  // TODO(youge325): Remove after DeepEP paddle branch is updated to use
-  // stream()
-  cudaStream_t raw_stream() const { return stream(); }
-
  private:
   Stream stream_;
 };
@@ -192,6 +188,9 @@ inline CUDAStream getCurrentCUDAStream(c10::DeviceIndex device_index = -1) {
  * record_stream lifetime semantics.
  */
 inline CUDAStream getStreamFromPool(const bool isHighPriority = false,
+                                    c10::DeviceIndex device_index = -1);
+
+inline CUDAStream getStreamFromPool(const int priority,
                                     c10::DeviceIndex device_index = -1) {
   if (device_index == -1) {
     device_index = phi::backends::gpu::GetCurrentDeviceId();
@@ -203,7 +202,9 @@ inline CUDAStream getStreamFromPool(const bool isHighPriority = false,
   });
 
   cudaStream_t raw;
-  if (isHighPriority) {
+  // Keep parity with PyTorch API shape: negative priority selects the
+  // high-priority pool, non-negative selects the low-priority pool.
+  if (priority < 0) {
     raw = state.high_priority[state.hp_counter.fetch_add(1) %
                               detail::kStreamsPerPool];
   } else {
@@ -213,6 +214,11 @@ inline CUDAStream getStreamFromPool(const bool isHighPriority = false,
   return make_cuda_stream(raw, device_index);
 }
 
+inline CUDAStream getStreamFromPool(const bool isHighPriority,
+                                    c10::DeviceIndex device_index) {
+  return getStreamFromPool(isHighPriority ? -1 : 0, device_index);
+}
+
 /**
  * Set the current CUDA stream for the device of the given stream in the
  * calling thread.

From 2e43d1f1331fea104c1071516ba575189e3743b8 Mon Sep 17 00:00:00 2001
From: youge325 <A372707325@126.com>
Date: Wed, 25 Mar 2026 19:51:27 +0800
Subject: [PATCH 2/8] remove some other APIs

---
 .../phi/api/include/compat/ATen/core/TensorBody.h   |  6 ------
 .../phi/api/include/compat/ATen/ops/record_stream.h | 13 -------------
 paddle/phi/api/include/compat/c10/core/Event.h      |  6 ------
 3 files changed, 25 deletions(-)

diff --git a/paddle/phi/api/include/compat/ATen/core/TensorBody.h b/paddle/phi/api/include/compat/ATen/core/TensorBody.h
index a12dfba80b5bec..6445befda15ee5 100644
--- a/paddle/phi/api/include/compat/ATen/core/TensorBody.h
+++ b/paddle/phi/api/include/compat/ATen/core/TensorBody.h
@@ -687,12 +687,6 @@ class Tensor : public TensorBase {
   }
 
   void record_stream(at::Stream s) const;
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  void record_stream(at::cuda::CUDAStream s) const;
-  // TODO(youge325): Remove after DeepEP paddle branch is updated to use
-  // at::Stream
-  void record_stream(cudaStream_t s) const;
-#endif
 
   Tensor var(int dim) const { return var(at::IntArrayRef{dim}, true, false); }
 
diff --git a/paddle/phi/api/include/compat/ATen/ops/record_stream.h b/paddle/phi/api/include/compat/ATen/ops/record_stream.h
index 73cb5dd4b2247c..ea6a21d98140f5 100644
--- a/paddle/phi/api/include/compat/ATen/ops/record_stream.h
+++ b/paddle/phi/api/include/compat/ATen/ops/record_stream.h
@@ -54,17 +54,4 @@ inline void Tensor::record_stream(at::cuda::CUDAStream s) const {
   record_stream(static_cast<at::Stream>(s));
 }
 
-// TODO(youge325): Remove after DeepEP paddle branch is updated to use
-// at::Stream
-inline void Tensor::record_stream(cudaStream_t s) const {
-  auto dense_tensor =
-      std::dynamic_pointer_cast<phi::DenseTensor>(tensor_.impl());
-  PD_CHECK(dense_tensor != nullptr,
-           "record_stream only supports DenseTensor, but got a non-dense "
-           "tensor implementation.");
-  PD_CHECK(dense_tensor->place().GetType() != phi::AllocationType::CPU,
-           "record_stream is not supported for CPU tensors.");
-  paddle::memory::RecordStream(dense_tensor->Holder(), s);
-}
-#endif
 }  // namespace at
diff --git a/paddle/phi/api/include/compat/c10/core/Event.h b/paddle/phi/api/include/compat/c10/core/Event.h
index 5d2c2d10b710d5..2c58fe97587245 100644
--- a/paddle/phi/api/include/compat/c10/core/Event.h
+++ b/paddle/phi/api/include/compat/c10/core/Event.h
@@ -136,12 +136,6 @@ struct Event final {
 
   void record(const c10::cuda::CUDAStream &stream) { record(stream.unwrap()); }
 
-  // TODO(youge325): Remove after DeepEP paddle branch is updated to use
-  // c10::Stream
-  void record(const cudaStream_t &stream) {
-    C10_CUDA_CHECK(cudaEventRecord(cuda_event_, stream));
-  }
-
   void block(const Stream &stream) const {
     C10_CUDA_CHECK(cudaStreamWaitEvent(
         static_cast<cudaStream_t>(stream.native_handle()), cuda_event_, 0));

From e6d3638ae6c257b7de75d28700ddabdbf4ba5a3f Mon Sep 17 00:00:00 2001
From: youge325 <A372707325@126.com>
Date: Wed, 25 Mar 2026 20:01:22 +0800
Subject: [PATCH 3/8] remove useless annotation

---
 .../phi/api/include/compat/c10/core/Event.h   | 32 -------------------
 1 file changed, 32 deletions(-)

diff --git a/paddle/phi/api/include/compat/c10/core/Event.h b/paddle/phi/api/include/compat/c10/core/Event.h
index 2c58fe97587245..04ea3c7eaae0f1 100644
--- a/paddle/phi/api/include/compat/c10/core/Event.h
+++ b/paddle/phi/api/include/compat/c10/core/Event.h
@@ -20,38 +20,6 @@
 #include <queue>
 namespace c10 {
 
-/**
- * A backend-generic movable, not copyable, not thread-safe event.
- *
- * The design of this event follows that of CUDA and HIP events. These events
- * are recorded and waited on by streams and can be rerecorded to,
- * each rerecording essentially creating a new version of the event.
- * For example, if (in CPU time), stream X is asked to record E,
- * stream Y waits on E, and stream X is asked to record E again, then Y will
- * wait for X to finish the first call to record and not the second, because
- * it's waiting on the first version of event E, not the second.
- * Querying an event only returns the status of its most recent version.
- *
- * Backend-generic events are implemented by this class and
- * impl::InlineEvent. In addition to these events there are also
- * some backend-specific events, like ATen's CUDAEvent. Each of these
- * classes has its own use.
- *
- * impl::InlineEvent<...> or a backend-specific event should be
- * preferred when the backend is known at compile time and known to
- * be compiled. Backend-specific events may have additional functionality.
- *
- * This Event should be used if a particular backend may not be available,
- * or the backend required is not known at compile time.
- *
- * These generic events are built on top of DeviceGuardImpls, analogous
- * to DeviceGuard and InlineDeviceGuard. The name "DeviceGuardImpls,"
- * is no longer entirely accurate, as these classes implement the
- * backend-specific logic for a generic backend interface.
- *
- * See DeviceGuardImplInterface.h for a list of all supported flags.
- */
-
 #ifdef PADDLE_WITH_CUDA
 
 class EventPool {

From 91428a612ff98e5d4b65fabffa6a4d16719cedf9 Mon Sep 17 00:00:00 2001
From: youge325 <A372707325@126.com>
Date: Wed, 25 Mar 2026 20:24:52 +0800
Subject: [PATCH 4/8] using alias for c10::Stream

---
 paddle/phi/api/include/compat/c10/core/Stream.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/paddle/phi/api/include/compat/c10/core/Stream.h b/paddle/phi/api/include/compat/c10/core/Stream.h
index 58912130daf303..e000ca9ac13650 100644
--- a/paddle/phi/api/include/compat/c10/core/Stream.h
+++ b/paddle/phi/api/include/compat/c10/core/Stream.h
@@ -106,3 +106,7 @@ struct hash<c10::Stream> {
   }
 };
 }  // namespace std
+
+namespace at {
+using c10::Stream;
+}

From dc7610a09245716a88b4db070a63c1d3d4681a97 Mon Sep 17 00:00:00 2001
From: youge325 <A372707325@126.com>
Date: Wed, 25 Mar 2026 20:31:29 +0800
Subject: [PATCH 5/8] fix compiling error

---
 paddle/phi/api/include/compat/ATen/ops/record_stream.h | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/paddle/phi/api/include/compat/ATen/ops/record_stream.h b/paddle/phi/api/include/compat/ATen/ops/record_stream.h
index ea6a21d98140f5..ff43391cdc796c 100644
--- a/paddle/phi/api/include/compat/ATen/ops/record_stream.h
+++ b/paddle/phi/api/include/compat/ATen/ops/record_stream.h
@@ -49,9 +49,4 @@ inline void Tensor::record_stream(at::Stream s) const {
 #endif
 }
 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-inline void Tensor::record_stream(at::cuda::CUDAStream s) const {
-  record_stream(static_cast<at::Stream>(s));
-}
-
 }  // namespace at

From 7b70cb059e0762aa709beebc590287b1c83a90be Mon Sep 17 00:00:00 2001
From: youge325 <A372707325@126.com>
Date: Wed, 1 Apr 2026 16:18:57 +0800
Subject: [PATCH 6/8] align Event related APIs

---
 .../phi/api/include/compat/c10/core/Event.h   | 114 ++++++++++++++++--
 1 file changed, 104 insertions(+), 10 deletions(-)

diff --git a/paddle/phi/api/include/compat/c10/core/Event.h b/paddle/phi/api/include/compat/c10/core/Event.h
index 04ea3c7eaae0f1..b880c28122705e 100644
--- a/paddle/phi/api/include/compat/c10/core/Event.h
+++ b/paddle/phi/api/include/compat/c10/core/Event.h
@@ -16,10 +16,17 @@
 
 #include <c10/core/Device.h>
 #include <c10/core/DeviceType.h>
+#include <c10/core/Stream.h>
+
+#ifdef PADDLE_WITH_CUDA
 #include <c10/cuda/CUDAStream.h>
 #include <queue>
+#endif
+
 namespace c10 {
 
+enum class EventFlag { PYTORCH_DEFAULT, BACKEND_DEFAULT, INVALID };
+
 #ifdef PADDLE_WITH_CUDA
 
 class EventPool {
@@ -90,29 +97,118 @@ cudaEvent_t EventPool::CreateCudaEventFromPool() {
   return CreateNewOrReuseEvent();
 }
 
+#endif  // PADDLE_WITH_CUDA
+
 struct Event final {
  public:
-  Event(const DeviceType &type) {
-    // device_type is useless, only for compatibility
-    cuda_event_ = EventPool::Instance().CreateCudaEventFromPool();
+  Event() = delete;
+  Event(const DeviceType device_type,
+        const EventFlag flag = EventFlag::PYTORCH_DEFAULT)
+      : device_type_(device_type), flag_(flag) {
+#ifdef PADDLE_WITH_CUDA
+    if (device_type == DeviceType::CUDA) {
+      cuda_event_ = EventPool::Instance().CreateCudaEventFromPool();
+    }
+#endif
+  }
+
+  Event(const Event &) = delete;
+  Event &operator=(const Event &) = delete;
+  Event(Event &&) = default;
+  Event &operator=(Event &&) = default;
+  ~Event() = default;
+
+  Device device() const noexcept { return Device(device_type_, device_index_); }
+  DeviceType device_type() const noexcept { return device_type_; }
+  DeviceIndex device_index() const noexcept { return device_index_; }
+  EventFlag flag() const noexcept { return flag_; }
+  bool was_marked_for_recording() const noexcept {
+    return was_marked_for_recording_;
+  }
+
+  void recordOnce(const Stream &stream) {
+    if (!was_marked_for_recording_) record(stream);
   }
 
   void record(const Stream &stream) {
-    C10_CUDA_CHECK(cudaEventRecord(
-        cuda_event_, static_cast<cudaStream_t>(stream.native_handle())));
+    TORCH_CHECK(
+        stream.device_type() == device_type_,
+        "Event device type does not match recording stream's device type.");
+#ifdef PADDLE_WITH_CUDA
+    if (device_type_ == DeviceType::CUDA && cuda_event_) {
+      C10_CUDA_CHECK(cudaEventRecord(
+          cuda_event_, static_cast<cudaStream_t>(stream.native_handle())));
+      was_marked_for_recording_ = true;
+      device_index_ = stream.device_index();
+      return;
+    }
+#endif
+    TORCH_CHECK(false, "Backend doesn't support events.");
   }
 
+#ifdef PADDLE_WITH_CUDA
   void record(const c10::cuda::CUDAStream &stream) { record(stream.unwrap()); }
+#endif
 
   void block(const Stream &stream) const {
-    C10_CUDA_CHECK(cudaStreamWaitEvent(
-        static_cast<cudaStream_t>(stream.native_handle()), cuda_event_, 0));
+    if (!was_marked_for_recording_) return;
+    TORCH_CHECK(
+        stream.device_type() == device_type_,
+        "Event device type does not match blocking stream's device type.");
+#ifdef PADDLE_WITH_CUDA
+    if (device_type_ == DeviceType::CUDA && cuda_event_) {
+      C10_CUDA_CHECK(cudaStreamWaitEvent(
+          static_cast<cudaStream_t>(stream.native_handle()), cuda_event_, 0));
+      return;
+    }
+#endif
+    TORCH_CHECK(false, "Backend doesn't support events.");
   }
 
+  bool query() const {
+    if (!was_marked_for_recording_) return true;
+#ifdef PADDLE_WITH_CUDA
+    if (device_type_ == DeviceType::CUDA && cuda_event_) {
+      return cudaEventQuery(cuda_event_) == cudaSuccess;
+    }
+#endif
+    TORCH_CHECK(false, "Backend doesn't support events.");
+    return true;
+  }
+
+  double elapsedTime(const Event &event) const {
+    (void)event;
+    return 0.0;
+  }
+
+  void *eventId() const {
+#ifdef PADDLE_WITH_CUDA
+    return cuda_event_;
+#else
+    return nullptr;
+#endif
+  }
+
+  void synchronize() const {
+#ifdef PADDLE_WITH_CUDA
+    if (device_type_ == DeviceType::CUDA && cuda_event_) {
+      C10_CUDA_CHECK(cudaEventSynchronize(cuda_event_));
+    }
+#endif
+  }
+
+#ifdef PADDLE_WITH_CUDA
   cudaEvent_t cuda_event() const { return cuda_event_; }
+#endif
 
  private:
-  cudaEvent_t cuda_event_;
+  DeviceType device_type_;
+  DeviceIndex device_index_ = -1;
+  EventFlag flag_ = EventFlag::PYTORCH_DEFAULT;
+  bool was_marked_for_recording_ = false;
+#ifdef PADDLE_WITH_CUDA
+  cudaEvent_t cuda_event_ = nullptr;
+#endif
 };
 
 }  // namespace c10
@@ -120,5 +216,3 @@ struct Event final {
 namespace torch {
 using c10::Event;
 }  // namespace torch
-
-#endif

From ce69b67d7d0750e0a32bd8741f941b1ff37afb5d Mon Sep 17 00:00:00 2001
From: youge325 <A372707325@126.com>
Date: Thu, 2 Apr 2026 15:28:44 +0800
Subject: [PATCH 7/8] revert deleted stream APIs and fix Event

---
 .../api/include/compat/ATen/core/TensorBody.h |   6 +
 .../include/compat/ATen/ops/record_stream.h   |  18 ++
 .../phi/api/include/compat/c10/core/Event.h   | 273 +++++++++++-------
 .../phi/api/include/compat/c10/core/Stream.h  |   4 -
 .../api/include/compat/c10/cuda/CUDAStream.h  |   4 +
 test/cpp/compat/ATen_record_stream_test.cc    |  14 +
 test/cpp/compat/CMakeLists.txt                |   1 +
 test/cpp/compat/c10_Event_test.cc             | 118 ++++++++
 8 files changed, 329 insertions(+), 109 deletions(-)
 create mode 100644 test/cpp/compat/c10_Event_test.cc

diff --git a/paddle/phi/api/include/compat/ATen/core/TensorBody.h b/paddle/phi/api/include/compat/ATen/core/TensorBody.h
index 6445befda15ee5..a12dfba80b5bec 100644
--- a/paddle/phi/api/include/compat/ATen/core/TensorBody.h
+++ b/paddle/phi/api/include/compat/ATen/core/TensorBody.h
@@ -687,6 +687,12 @@ class Tensor : public TensorBase {
   }
 
   void record_stream(at::Stream s) const;
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  void record_stream(at::cuda::CUDAStream s) const;
+  // TODO(youge325): Remove after DeepEP paddle branch is updated to use
+  // at::Stream
+  void record_stream(cudaStream_t s) const;
+#endif
 
   Tensor var(int dim) const { return var(at::IntArrayRef{dim}, true, false); }
 
diff --git a/paddle/phi/api/include/compat/ATen/ops/record_stream.h b/paddle/phi/api/include/compat/ATen/ops/record_stream.h
index ff43391cdc796c..73cb5dd4b2247c 100644
--- a/paddle/phi/api/include/compat/ATen/ops/record_stream.h
+++ b/paddle/phi/api/include/compat/ATen/ops/record_stream.h
@@ -49,4 +49,22 @@ inline void Tensor::record_stream(at::Stream s) const {
 #endif
 }
 
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+inline void Tensor::record_stream(at::cuda::CUDAStream s) const {
+  record_stream(static_cast<at::Stream>(s));
+}
+
+// TODO(youge325): Remove after DeepEP paddle branch is updated to use
+// at::Stream
+inline void Tensor::record_stream(cudaStream_t s) const {
+  auto dense_tensor =
+      std::dynamic_pointer_cast<phi::DenseTensor>(tensor_.impl());
+  PD_CHECK(dense_tensor != nullptr,
+           "record_stream only supports DenseTensor, but got a non-dense "
+           "tensor implementation.");
+  PD_CHECK(dense_tensor->place().GetType() != phi::AllocationType::CPU,
+           "record_stream is not supported for CPU tensors.");
+  paddle::memory::RecordStream(dense_tensor->Holder(), s);
+}
+#endif
 }  // namespace at
diff --git a/paddle/phi/api/include/compat/c10/core/Event.h b/paddle/phi/api/include/compat/c10/core/Event.h
index b880c28122705e..00919c7f13e300 100644
--- a/paddle/phi/api/include/compat/c10/core/Event.h
+++ b/paddle/phi/api/include/compat/c10/core/Event.h
@@ -18,105 +18,37 @@
 #include <c10/core/DeviceType.h>
 #include <c10/core/Stream.h>
 
+#include <utility>
+
 #ifdef PADDLE_WITH_CUDA
+#include <c10/cuda/CUDAGuard.h>
 #include <c10/cuda/CUDAStream.h>
-#include <queue>
 #endif
 
 namespace c10 {
 
 enum class EventFlag { PYTORCH_DEFAULT, BACKEND_DEFAULT, INVALID };
 
-#ifdef PADDLE_WITH_CUDA
-
-class EventPool {
- public:
-  EventPool();
-  EventPool(const EventPool &) = delete;
-  EventPool(EventPool &&) = delete;
-  ~EventPool();
-
-  cudaEvent_t CreateCudaEventFromPool();
-
-  static EventPool &Instance();
-
- private:
-  std::queue<cudaEvent_t> incomplished_events_;
-  std::mutex mtx_;
-};
-
-EventPool &EventPool::Instance() {
-  static EventPool pool;
-  return pool;
-}
-
-EventPool::EventPool() {
-  for (size_t i = 0; i < 1000; ++i) {
-    cudaEvent_t new_event;
-    C10_CUDA_CHECK(cudaEventCreate(&new_event));
-
-    cudaEventRecord(new_event, 0);
-    incomplished_events_.push(new_event);
-  }
-}
-
-EventPool::~EventPool() {
-  std::unique_lock<std::mutex> lock(mtx_);
-  while (!incomplished_events_.empty()) {
-    cudaEvent_t event = incomplished_events_.front();
-    incomplished_events_.pop();
-    if (cudaEventQuery(event) == cudaSuccess) {
-      C10_CUDA_CHECK(cudaEventDestroy(event));
-    }
-  }
-}
-
-cudaEvent_t EventPool::CreateCudaEventFromPool() {
-  std::unique_lock<std::mutex> lock(mtx_);
-
-  const auto &CreateNewEvent = [&]() -> cudaEvent_t {
-    cudaEvent_t new_event;
-    C10_CUDA_CHECK(cudaEventCreate(&new_event));
-    incomplished_events_.push(new_event);
-    return new_event;
-  };
-
-  const auto &CreateNewOrReuseEvent = [&]() -> cudaEvent_t {
-    cudaEvent_t front_event = incomplished_events_.front();
-    incomplished_events_.pop();
-    incomplished_events_.push(front_event);
-    if (cudaEventQuery(front_event) == cudaSuccess) {
-      return front_event;
-    }
-    return CreateNewEvent();
-  };
-
-  if (incomplished_events_.empty()) {
-    return CreateNewEvent();
-  }
-  return CreateNewOrReuseEvent();
-}
-
-#endif  // PADDLE_WITH_CUDA
-
 struct Event final {
  public:
   Event() = delete;
   Event(const DeviceType device_type,
         const EventFlag flag = EventFlag::PYTORCH_DEFAULT)
-      : device_type_(device_type), flag_(flag) {
-#ifdef PADDLE_WITH_CUDA
-    if (device_type == DeviceType::CUDA) {
-      cuda_event_ = EventPool::Instance().CreateCudaEventFromPool();
+      : device_type_(device_type), flag_(flag) {}
+
+  Event(const Event&) = delete;
+  Event& operator=(const Event&) = delete;
+
+  Event(Event&& other) noexcept { MoveFrom(std::move(other)); }
+  Event& operator=(Event&& other) noexcept {
+    if (this != &other) {
+      DestroyCudaEvent();
+      MoveFrom(std::move(other));
     }
-#endif
+    return *this;
   }
 
-  Event(const Event &) = delete;
-  Event &operator=(const Event &) = delete;
-  Event(Event &&) = default;
-  Event &operator=(Event &&) = default;
-  ~Event() = default;
+  ~Event() { DestroyCudaEvent(); }
 
   Device device() const noexcept { return Device(device_type_, device_index_); }
   DeviceType device_type() const noexcept { return device_type_; }
@@ -126,20 +58,23 @@ struct Event final {
     return was_marked_for_recording_;
   }
 
-  void recordOnce(const Stream &stream) {
-    if (!was_marked_for_recording_) record(stream);
+  void recordOnce(const Stream& stream) {
+    if (!was_marked_for_recording_) {
+      record(stream);
+    }
   }
 
-  void record(const Stream &stream) {
-    TORCH_CHECK(
-        stream.device_type() == device_type_,
-        "Event device type does not match recording stream's device type.");
+  void record(const Stream& stream) {
+    TORCH_CHECK(stream.device_type() == device_type_,
+                "Event device type ",
+                device_type_,
+                " does not match recording stream's device type ",
+                stream.device_type(),
+                ".");
 #ifdef PADDLE_WITH_CUDA
-    if (device_type_ == DeviceType::CUDA && cuda_event_) {
-      C10_CUDA_CHECK(cudaEventRecord(
-          cuda_event_, static_cast<cudaStream_t>(stream.native_handle())));
-      was_marked_for_recording_ = true;
-      device_index_ = stream.device_index();
+    if (device_type_ == DeviceType::CUDA) {
+      RecordCudaEvent(static_cast<cudaStream_t>(stream.native_handle()),
+                      stream.device_index());
       return;
     }
 #endif
@@ -147,16 +82,37 @@ struct Event final {
   }
 
 #ifdef PADDLE_WITH_CUDA
-  void record(const c10::cuda::CUDAStream &stream) { record(stream.unwrap()); }
-#endif
+  void record(const c10::cuda::CUDAStream& stream) { record(stream.unwrap()); }
 
-  void block(const Stream &stream) const {
-    if (!was_marked_for_recording_) return;
+  // TODO(youge325): Remove after DeepEP paddle branch is updated to use
+  // c10::Stream
+  void record(const cudaStream_t& stream) {
     TORCH_CHECK(
-        stream.device_type() == device_type_,
-        "Event device type does not match blocking stream's device type.");
+        device_type_ == DeviceType::CUDA,
+        "Raw cudaStream_t recording is only supported for CUDA events.");
+    RecordCudaEvent(stream, phi::backends::gpu::GetCurrentDeviceId());
+  }
+#endif
+
+  void block(const Stream& stream) const {
+    if (!was_marked_for_recording_) {
+      return;
+    }
+    TORCH_CHECK(stream.device_type() == device_type_,
+                "Event device type ",
+                device_type_,
+                " does not match blocking stream's device type ",
+                stream.device_type(),
+                ".");
 #ifdef PADDLE_WITH_CUDA
     if (device_type_ == DeviceType::CUDA && cuda_event_) {
+      TORCH_CHECK(device_index_ == stream.device_index(),
+                  "Event device index ",
+                  static_cast<int>(device_index_),
+                  " does not match blocking stream's device index ",
+                  static_cast<int>(stream.device_index()),
+                  ".");
+      c10::cuda::CUDAGuard guard(device_index_);
       C10_CUDA_CHECK(cudaStreamWaitEvent(
           static_cast<cudaStream_t>(stream.native_handle()), cuda_event_, 0));
       return;
@@ -166,22 +122,64 @@ struct Event final {
   }
 
   bool query() const {
-    if (!was_marked_for_recording_) return true;
+    if (!was_marked_for_recording_) {
+      return true;
+    }
 #ifdef PADDLE_WITH_CUDA
     if (device_type_ == DeviceType::CUDA && cuda_event_) {
-      return cudaEventQuery(cuda_event_) == cudaSuccess;
+      const auto err = cudaEventQuery(cuda_event_);
+      if (err == cudaSuccess) {
+        return true;
+      }
+      if (err != cudaErrorNotReady) {
+        C10_CUDA_CHECK(err);
+      } else {
+        (void)cudaGetLastError();
+      }
+      return false;
     }
 #endif
     TORCH_CHECK(false, "Backend doesn't support events.");
     return true;
   }
 
-  double elapsedTime(const Event &event) const {
-    (void)event;
+  double elapsedTime(const Event& event) const {
+    TORCH_CHECK(event.device_type() == device_type_,
+                "Event device type ",
+                device_type_,
+                " does not match other's device type ",
+                event.device_type(),
+                ".");
+    TORCH_CHECK(
+        flag_ == EventFlag::BACKEND_DEFAULT &&
+            event.flag_ == EventFlag::BACKEND_DEFAULT,
+        "Both events must be created with argument 'enable_timing=True'.");
+    TORCH_CHECK(
+        was_marked_for_recording_ && event.was_marked_for_recording_,
+        "Both events must be recorded before calculating elapsed time.");
+    TORCH_CHECK(
+        query() && event.query(),
+        "Both events must be completed before calculating elapsed time.");
+#ifdef PADDLE_WITH_CUDA
+    if (device_type_ == DeviceType::CUDA && cuda_event_ && event.cuda_event_) {
+      TORCH_CHECK(device_index_ == event.device_index_,
+                  "Event device index ",
+                  static_cast<int>(device_index_),
+                  " does not match other's device index ",
+                  static_cast<int>(event.device_index_),
+                  ".");
+      c10::cuda::CUDAGuard guard(device_index_);
+      float time_ms = 0.0f;
+      C10_CUDA_CHECK(
+          cudaEventElapsedTime(&time_ms, cuda_event_, event.cuda_event_));
+      return static_cast<double>(time_ms);
+    }
+#endif
+    TORCH_CHECK(false, "Backend doesn't support event elapsedTime.");
     return 0.0;
   }
 
-  void *eventId() const {
+  void* eventId() const {
 #ifdef PADDLE_WITH_CUDA
     return cuda_event_;
 #else
@@ -190,11 +188,16 @@ struct Event final {
   }
 
   void synchronize() const {
+    if (!was_marked_for_recording_) {
+      return;
+    }
 #ifdef PADDLE_WITH_CUDA
     if (device_type_ == DeviceType::CUDA && cuda_event_) {
       C10_CUDA_CHECK(cudaEventSynchronize(cuda_event_));
+      return;
     }
 #endif
+    TORCH_CHECK(false, "Backend doesn't support events.");
   }
 
 #ifdef PADDLE_WITH_CUDA
@@ -208,7 +211,67 @@ struct Event final {
   bool was_marked_for_recording_ = false;
 #ifdef PADDLE_WITH_CUDA
   cudaEvent_t cuda_event_ = nullptr;
+
+  static unsigned int CudaEventCreateFlags(EventFlag flag) {
+    switch (flag) {
+      case EventFlag::PYTORCH_DEFAULT:
+        return cudaEventDisableTiming;
+      case EventFlag::BACKEND_DEFAULT:
+        return cudaEventDefault;
+      default:
+        TORCH_CHECK(false, "CUDA event received unknown flag");
+    }
+  }
+
+  void EnsureCudaEventCreated(DeviceIndex stream_device_index) {
+    if (cuda_event_) {
+      return;
+    }
+    c10::cuda::CUDAGuard guard(stream_device_index);
+    C10_CUDA_CHECK(
+        cudaEventCreateWithFlags(&cuda_event_, CudaEventCreateFlags(flag_)));
+  }
+
+  void RecordCudaEvent(cudaStream_t stream, DeviceIndex stream_device_index) {
+    TORCH_CHECK(device_index_ == -1 || device_index_ == stream_device_index,
+                "Event device index ",
+                static_cast<int>(device_index_),
+                " does not match recording stream's device index ",
+                static_cast<int>(stream_device_index),
+                ".");
+    EnsureCudaEventCreated(stream_device_index);
+    c10::cuda::CUDAGuard guard(stream_device_index);
+    C10_CUDA_CHECK(cudaEventRecord(cuda_event_, stream));
+    device_index_ = stream_device_index;
+    was_marked_for_recording_ = true;
+  }
+
+  void DestroyCudaEvent() noexcept {
+    if (!cuda_event_) {
+      return;
+    }
+    try {
+      c10::cuda::CUDAGuard guard(device_index_);
+      C10_CUDA_CHECK(cudaEventDestroy(cuda_event_));
+    } catch (...) {
+    }
+    cuda_event_ = nullptr;
+  }
+#else
+  void DestroyCudaEvent() noexcept {}
 #endif
+
+  void MoveFrom(Event&& other) noexcept {
+    device_type_ = other.device_type_;
+    device_index_ = other.device_index_;
+    flag_ = other.flag_;
+    was_marked_for_recording_ = other.was_marked_for_recording_;
+#ifdef PADDLE_WITH_CUDA
+    cuda_event_ = std::exchange(other.cuda_event_, nullptr);
+#endif
+    other.device_index_ = -1;
+    other.was_marked_for_recording_ = false;
+  }
 };
 
 }  // namespace c10
diff --git a/paddle/phi/api/include/compat/c10/core/Stream.h b/paddle/phi/api/include/compat/c10/core/Stream.h
index e000ca9ac13650..58912130daf303 100644
--- a/paddle/phi/api/include/compat/c10/core/Stream.h
+++ b/paddle/phi/api/include/compat/c10/core/Stream.h
@@ -106,7 +106,3 @@ struct hash<c10::Stream> {
   }
 };
 }  // namespace std
-
-namespace at {
-using c10::Stream;
-}
diff --git a/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h b/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h
index a6c1e95b61ef0f..364a634f9abecf 100644
--- a/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h
+++ b/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h
@@ -134,6 +134,10 @@ class CUDAStream {
 
   Device device() const { return Device(DeviceType::CUDA, device_index()); }
 
+  // TODO(youge325): Remove after DeepEP paddle branch is updated to use
+  // stream()
+  cudaStream_t raw_stream() const { return stream(); }
+
  private:
   Stream stream_;
 };
diff --git a/test/cpp/compat/ATen_record_stream_test.cc b/test/cpp/compat/ATen_record_stream_test.cc
index db895005714d0e..8be51d243d1022 100644
--- a/test/cpp/compat/ATen_record_stream_test.cc
+++ b/test/cpp/compat/ATen_record_stream_test.cc
@@ -47,6 +47,14 @@ class RecordStreamTest : public ::testing::Test {
 
 // --- Happy path: CUDA tensor + current CUDA stream should succeed ---
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+using RecordCudaStreamMethod = void (at::Tensor::*)(at::cuda::CUDAStream) const;
+[[maybe_unused]] static RecordCudaStreamMethod g_record_cuda_stream_method =
+    &at::Tensor::record_stream;
+
+using RecordRawCudaStreamMethod = void (at::Tensor::*)(cudaStream_t) const;
+[[maybe_unused]] static RecordRawCudaStreamMethod
+    g_record_raw_cuda_stream_method = &at::Tensor::record_stream;
+
 TEST_F(RecordStreamTest, CudaTensorCurrentCudaStream) {
   SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
   auto stream = at::cuda::getCurrentCUDAStream();
@@ -60,6 +68,12 @@ TEST_F(RecordStreamTest, CudaTensorDefaultCudaStream) {
   c10::Stream default_stream = c10::cuda::getDefaultCUDAStream().unwrap();
   EXPECT_NO_THROW(cuda_tensor.record_stream(default_stream));
 }
+
+TEST_F(RecordStreamTest, CudaTensorRawCudaStream) {
+  SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
+  auto stream = at::cuda::getCurrentCUDAStream();
+  EXPECT_NO_THROW(cuda_tensor.record_stream(stream.raw_stream()));
+}
 #endif  // PADDLE_WITH_CUDA || PADDLE_WITH_HIP
 
 // --- Error path: CPU tensor + CPU stream (record_stream does not support CPU
diff --git a/test/cpp/compat/CMakeLists.txt b/test/cpp/compat/CMakeLists.txt
index 1e02c599f031f6..e771d03fb431ef 100644
--- a/test/cpp/compat/CMakeLists.txt
+++ b/test/cpp/compat/CMakeLists.txt
@@ -22,6 +22,7 @@ if(NOT WIN32)
     nv_test(c10_TypeMeta_test SRCS c10_TypeMeta_test.cc)
     nv_test(c10_storage_test SRCS c10_storage_test.cc)
     nv_test(c10_Stream_test SRCS c10_Stream_test.cc)
+    nv_test(c10_Event_test SRCS c10_Event_test.cc)
     nv_test(c10_SizesAndStrides_test SRCS c10_SizesAndStrides_test.cc)
     nv_test(c10_layout_test SRCS c10_layout_test.cc)
     nv_test(ATen_clamp_test SRCS ATen_clamp_test.cc)
diff --git a/test/cpp/compat/c10_Event_test.cc b/test/cpp/compat/c10_Event_test.cc
new file mode 100644
index 00000000000000..9b350a96280dbd
--- /dev/null
+++ b/test/cpp/compat/c10_Event_test.cc
@@ -0,0 +1,118 @@
+// Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <c10/core/Event.h>
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+#include <c10/cuda/CUDAFunctions.h>
+#include <c10/cuda/CUDAStream.h>
+#endif
+
+#include "gtest/gtest.h"
+#include "test/cpp/compat/cuda_test_utils.h"
+
+TEST(EventTest, CpuEventDefaultProperties) {
+  c10::Event event(c10::DeviceType::CPU);
+  EXPECT_EQ(event.device_type(), c10::DeviceType::CPU);
+  EXPECT_EQ(event.device_index(), -1);
+  EXPECT_EQ(event.flag(), c10::EventFlag::PYTORCH_DEFAULT);
+  EXPECT_FALSE(event.was_marked_for_recording());
+  EXPECT_TRUE(event.query());
+  EXPECT_EQ(event.eventId(), nullptr);
+}
+
+TEST(EventTest, CpuEventRecordThrows) {
+  c10::Event event(c10::DeviceType::CPU);
+  c10::Stream stream(c10::Stream::DEFAULT,
+                     c10::Device(c10::DeviceType::CPU, 0));
+  EXPECT_THROW(event.record(stream), std::exception);
+  EXPECT_THROW(event.recordOnce(stream), std::exception);
+}
+
+#ifdef PADDLE_WITH_CUDA
+using RawEventRecordMethod = void (c10::Event::*)(const cudaStream_t&);
+[[maybe_unused]] static RawEventRecordMethod g_raw_event_record_method =
+    &c10::Event::record;
+#endif
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+TEST(EventTest, CudaEventLazyCreateAndRecord) {
+  SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
+  c10::Event event(c10::DeviceType::CUDA);
+  auto stream = c10::cuda::getCurrentCUDAStream();
+
+  EXPECT_EQ(event.device_index(), -1);
+  EXPECT_EQ(event.eventId(), nullptr);
+  EXPECT_FALSE(event.was_marked_for_recording());
+
+  EXPECT_NO_THROW(event.record(stream.unwrap()));
+  EXPECT_EQ(event.device_index(), stream.device_index());
+  EXPECT_NE(event.eventId(), nullptr);
+  EXPECT_TRUE(event.was_marked_for_recording());
+
+  EXPECT_NO_THROW(event.synchronize());
+  EXPECT_TRUE(event.query());
+}
+
+TEST(EventTest, CudaEventElapsedTimeRequiresTimingFlag) {
+  SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
+  auto stream = c10::cuda::getCurrentCUDAStream();
+  c10::Event start(c10::DeviceType::CUDA);
+  c10::Event end(c10::DeviceType::CUDA);
+
+  start.record(stream.unwrap());
+  end.record(stream.unwrap());
+  end.synchronize();
+
+  EXPECT_THROW(start.elapsedTime(end), std::exception);
+}
+
+TEST(EventTest, CudaEventElapsedTimeWithTimingEnabled) {
+  SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
+  auto stream = c10::cuda::getCurrentCUDAStream();
+  c10::Event start(c10::DeviceType::CUDA, c10::EventFlag::BACKEND_DEFAULT);
+  c10::Event end(c10::DeviceType::CUDA, c10::EventFlag::BACKEND_DEFAULT);
+
+  start.record(stream.unwrap());
+  end.record(stream.unwrap());
+  end.synchronize();
+
+  double elapsed_ms = -1.0;
+  EXPECT_NO_THROW(elapsed_ms = start.elapsedTime(end));
+  EXPECT_GE(elapsed_ms, 0.0);
+}
+
+TEST(EventTest, CudaEventRawStreamRecordCompatibility) {
+  SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
+  auto stream = c10::cuda::getCurrentCUDAStream();
+  c10::Event event(c10::DeviceType::CUDA);
+  EXPECT_NO_THROW(event.record(stream.raw_stream()));
+  EXPECT_EQ(event.device_index(), stream.device_index());
+  EXPECT_TRUE(event.was_marked_for_recording());
+}
+
+TEST(EventTest, CudaEventRejectsDifferentDeviceRecord) {
+  SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
+  if (c10::cuda::device_count() < 2) {
+    return;
+  }
+
+  c10::Event event(c10::DeviceType::CUDA, c10::EventFlag::BACKEND_DEFAULT);
+  auto stream0 = c10::cuda::getDefaultCUDAStream(0);
+  auto stream1 = c10::cuda::getDefaultCUDAStream(1);
+
+  EXPECT_NO_THROW(event.record(stream0.unwrap()));
+  EXPECT_THROW(event.record(stream1.unwrap()), std::exception);
+}
+#endif  // PADDLE_WITH_CUDA || PADDLE_WITH_HIP

From 852d525ac182537aa60ea9b098f1edacb49f51d6 Mon Sep 17 00:00:00 2001
From: youge325 <A372707325@126.com>
Date: Thu, 2 Apr 2026 16:28:06 +0800
Subject: [PATCH 8/8] complement macro

---
 .../phi/api/include/compat/c10/core/Event.h   | 145 +++++++++++++-----
 test/cpp/compat/c10_Event_test.cc             |  16 +-
 2 files changed, 114 insertions(+), 47 deletions(-)

diff --git a/paddle/phi/api/include/compat/c10/core/Event.h b/paddle/phi/api/include/compat/c10/core/Event.h
index 00919c7f13e300..672fb7f496f907 100644
--- a/paddle/phi/api/include/compat/c10/core/Event.h
+++ b/paddle/phi/api/include/compat/c10/core/Event.h
@@ -20,7 +20,7 @@
 
 #include <utility>
 
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include <c10/cuda/CUDAGuard.h>
 #include <c10/cuda/CUDAStream.h>
 #endif
@@ -31,6 +31,16 @@ enum class EventFlag { PYTORCH_DEFAULT, BACKEND_DEFAULT, INVALID };
 
 struct Event final {
  public:
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+#ifdef PADDLE_WITH_HIP
+  using BackendEvent = hipEvent_t;
+  using BackendStream = hipStream_t;
+#else
+  using BackendEvent = cudaEvent_t;
+  using BackendStream = cudaStream_t;
+#endif
+#endif
+
   Event() = delete;
   Event(const DeviceType device_type,
         const EventFlag flag = EventFlag::PYTORCH_DEFAULT)
@@ -42,13 +52,13 @@ struct Event final {
   Event(Event&& other) noexcept { MoveFrom(std::move(other)); }
   Event& operator=(Event&& other) noexcept {
     if (this != &other) {
-      DestroyCudaEvent();
+      DestroyBackendEvent();
       MoveFrom(std::move(other));
     }
     return *this;
   }
 
-  ~Event() { DestroyCudaEvent(); }
+  ~Event() { DestroyBackendEvent(); }
 
   Device device() const noexcept { return Device(device_type_, device_index_); }
   DeviceType device_type() const noexcept { return device_type_; }
@@ -71,26 +81,28 @@ struct Event final {
                 " does not match recording stream's device type ",
                 stream.device_type(),
                 ".");
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     if (device_type_ == DeviceType::CUDA) {
-      RecordCudaEvent(static_cast<cudaStream_t>(stream.native_handle()),
-                      stream.device_index());
+      RecordBackendEvent(static_cast<BackendStream>(stream.native_handle()),
+                         stream.device_index());
       return;
     }
 #endif
     TORCH_CHECK(false, "Backend doesn't support events.");
   }
 
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   void record(const c10::cuda::CUDAStream& stream) { record(stream.unwrap()); }
+#endif
 
+#ifdef PADDLE_WITH_CUDA
   // TODO(youge325): Remove after DeepEP paddle branch is updated to use
   // c10::Stream
   void record(const cudaStream_t& stream) {
     TORCH_CHECK(
         device_type_ == DeviceType::CUDA,
         "Raw cudaStream_t recording is only supported for CUDA events.");
-    RecordCudaEvent(stream, phi::backends::gpu::GetCurrentDeviceId());
+    RecordBackendEvent(stream, phi::backends::gpu::GetCurrentDeviceId());
   }
 #endif
 
@@ -104,8 +116,8 @@ struct Event final {
                 " does not match blocking stream's device type ",
                 stream.device_type(),
                 ".");
-#ifdef PADDLE_WITH_CUDA
-    if (device_type_ == DeviceType::CUDA && cuda_event_) {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+    if (device_type_ == DeviceType::CUDA && backend_event_) {
       TORCH_CHECK(device_index_ == stream.device_index(),
                   "Event device index ",
                   static_cast<int>(device_index_),
@@ -113,8 +125,17 @@ struct Event final {
                   static_cast<int>(stream.device_index()),
                   ".");
       c10::cuda::CUDAGuard guard(device_index_);
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_GPU_SUCCESS(
+          hipStreamWaitEvent(static_cast<BackendStream>(stream.native_handle()),
+                             backend_event_,
+                             0));
+#else
       C10_CUDA_CHECK(cudaStreamWaitEvent(
-          static_cast<cudaStream_t>(stream.native_handle()), cuda_event_, 0));
+          static_cast<BackendStream>(stream.native_handle()),
+          backend_event_,
+          0));
+#endif
       return;
     }
 #endif
@@ -125,9 +146,20 @@ struct Event final {
     if (!was_marked_for_recording_) {
       return true;
     }
-#ifdef PADDLE_WITH_CUDA
-    if (device_type_ == DeviceType::CUDA && cuda_event_) {
-      const auto err = cudaEventQuery(cuda_event_);
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+    if (device_type_ == DeviceType::CUDA && backend_event_) {
+#ifdef PADDLE_WITH_HIP
+      const auto err = hipEventQuery(backend_event_);
+      if (err == hipSuccess) {
+        return true;
+      }
+      if (err != hipErrorNotReady) {
+        PADDLE_ENFORCE_GPU_SUCCESS(err);
+      } else {
+        (void)hipGetLastError();
+      }
+#else
+      const auto err = cudaEventQuery(backend_event_);
       if (err == cudaSuccess) {
         return true;
       }
@@ -136,6 +168,7 @@ struct Event final {
       } else {
         (void)cudaGetLastError();
       }
+#endif
       return false;
     }
 #endif
@@ -160,8 +193,9 @@ struct Event final {
     TORCH_CHECK(
         query() && event.query(),
         "Both events must be completed before calculating elapsed time.");
-#ifdef PADDLE_WITH_CUDA
-    if (device_type_ == DeviceType::CUDA && cuda_event_ && event.cuda_event_) {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+    if (device_type_ == DeviceType::CUDA && backend_event_ &&
+        event.backend_event_) {
       TORCH_CHECK(device_index_ == event.device_index_,
                   "Event device index ",
                   static_cast<int>(device_index_),
@@ -170,8 +204,13 @@ struct Event final {
                   ".");
       c10::cuda::CUDAGuard guard(device_index_);
       float time_ms = 0.0f;
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_GPU_SUCCESS(
+          hipEventElapsedTime(&time_ms, backend_event_, event.backend_event_));
+#else
       C10_CUDA_CHECK(
-          cudaEventElapsedTime(&time_ms, cuda_event_, event.cuda_event_));
+          cudaEventElapsedTime(&time_ms, backend_event_, event.backend_event_));
+#endif
       return static_cast<double>(time_ms);
     }
 #endif
@@ -180,8 +219,8 @@ struct Event final {
   }
 
   void* eventId() const {
-#ifdef PADDLE_WITH_CUDA
-    return cuda_event_;
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+    return backend_event_;
 #else
     return nullptr;
 #endif
@@ -191,9 +230,13 @@ struct Event final {
     if (!was_marked_for_recording_) {
       return;
     }
-#ifdef PADDLE_WITH_CUDA
-    if (device_type_ == DeviceType::CUDA && cuda_event_) {
-      C10_CUDA_CHECK(cudaEventSynchronize(cuda_event_));
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+    if (device_type_ == DeviceType::CUDA && backend_event_) {
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_GPU_SUCCESS(hipEventSynchronize(backend_event_));
+#else
+      C10_CUDA_CHECK(cudaEventSynchronize(backend_event_));
+#endif
       return;
     }
 #endif
@@ -201,7 +244,7 @@ struct Event final {
   }
 
 #ifdef PADDLE_WITH_CUDA
-  cudaEvent_t cuda_event() const { return cuda_event_; }
+  cudaEvent_t cuda_event() const { return backend_event_; }
 #endif
 
  private:
@@ -209,56 +252,78 @@ struct Event final {
   DeviceIndex device_index_ = -1;
   EventFlag flag_ = EventFlag::PYTORCH_DEFAULT;
   bool was_marked_for_recording_ = false;
-#ifdef PADDLE_WITH_CUDA
-  cudaEvent_t cuda_event_ = nullptr;
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  BackendEvent backend_event_ = nullptr;
 
-  static unsigned int CudaEventCreateFlags(EventFlag flag) {
+  static unsigned int BackendEventCreateFlags(EventFlag flag) {
     switch (flag) {
       case EventFlag::PYTORCH_DEFAULT:
+#ifdef PADDLE_WITH_HIP
+        return hipEventDisableTiming;
+#else
         return cudaEventDisableTiming;
+#endif
       case EventFlag::BACKEND_DEFAULT:
+#ifdef PADDLE_WITH_HIP
+        return hipEventDefault;
+#else
         return cudaEventDefault;
+#endif
       default:
         TORCH_CHECK(false, "CUDA event received unknown flag");
     }
   }
 
-  void EnsureCudaEventCreated(DeviceIndex stream_device_index) {
-    if (cuda_event_) {
+  void EnsureBackendEventCreated(DeviceIndex stream_device_index) {
+    if (backend_event_) {
       return;
     }
     c10::cuda::CUDAGuard guard(stream_device_index);
-    C10_CUDA_CHECK(
-        cudaEventCreateWithFlags(&cuda_event_, CudaEventCreateFlags(flag_)));
+#ifdef PADDLE_WITH_HIP
+    PADDLE_ENFORCE_GPU_SUCCESS(hipEventCreateWithFlags(
+        &backend_event_, BackendEventCreateFlags(flag_)));
+#else
+    C10_CUDA_CHECK(cudaEventCreateWithFlags(&backend_event_,
+                                            BackendEventCreateFlags(flag_)));
+#endif
   }
 
-  void RecordCudaEvent(cudaStream_t stream, DeviceIndex stream_device_index) {
+  void RecordBackendEvent(BackendStream stream,
+                          DeviceIndex stream_device_index) {
     TORCH_CHECK(device_index_ == -1 || device_index_ == stream_device_index,
                 "Event device index ",
                 static_cast<int>(device_index_),
                 " does not match recording stream's device index ",
                 static_cast<int>(stream_device_index),
                 ".");
-    EnsureCudaEventCreated(stream_device_index);
+    EnsureBackendEventCreated(stream_device_index);
     c10::cuda::CUDAGuard guard(stream_device_index);
-    C10_CUDA_CHECK(cudaEventRecord(cuda_event_, stream));
+#ifdef PADDLE_WITH_HIP
+    PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(backend_event_, stream));
+#else
+    C10_CUDA_CHECK(cudaEventRecord(backend_event_, stream));
+#endif
     device_index_ = stream_device_index;
     was_marked_for_recording_ = true;
   }
 
-  void DestroyCudaEvent() noexcept {
-    if (!cuda_event_) {
+  void DestroyBackendEvent() noexcept {
+    if (!backend_event_) {
       return;
     }
     try {
       c10::cuda::CUDAGuard guard(device_index_);
-      C10_CUDA_CHECK(cudaEventDestroy(cuda_event_));
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(backend_event_));
+#else
+      C10_CUDA_CHECK(cudaEventDestroy(backend_event_));
+#endif
     } catch (...) {
     }
-    cuda_event_ = nullptr;
+    backend_event_ = nullptr;
   }
 #else
-  void DestroyCudaEvent() noexcept {}
+  void DestroyBackendEvent() noexcept {}
 #endif
 
   void MoveFrom(Event&& other) noexcept {
@@ -266,8 +331,8 @@ struct Event final {
     device_index_ = other.device_index_;
     flag_ = other.flag_;
     was_marked_for_recording_ = other.was_marked_for_recording_;
-#ifdef PADDLE_WITH_CUDA
-    cuda_event_ = std::exchange(other.cuda_event_, nullptr);
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+    backend_event_ = std::exchange(other.backend_event_, nullptr);
 #endif
     other.device_index_ = -1;
     other.was_marked_for_recording_ = false;
diff --git a/test/cpp/compat/c10_Event_test.cc b/test/cpp/compat/c10_Event_test.cc
index 9b350a96280dbd..a6933e06efea36 100644
--- a/test/cpp/compat/c10_Event_test.cc
+++ b/test/cpp/compat/c10_Event_test.cc
@@ -56,7 +56,7 @@ TEST(EventTest, CudaEventLazyCreateAndRecord) {
   EXPECT_EQ(event.eventId(), nullptr);
   EXPECT_FALSE(event.was_marked_for_recording());
 
-  EXPECT_NO_THROW(event.record(stream.unwrap()));
+  EXPECT_NO_THROW(event.record(stream));
   EXPECT_EQ(event.device_index(), stream.device_index());
   EXPECT_NE(event.eventId(), nullptr);
   EXPECT_TRUE(event.was_marked_for_recording());
@@ -71,8 +71,8 @@ TEST(EventTest, CudaEventElapsedTimeRequiresTimingFlag) {
   c10::Event start(c10::DeviceType::CUDA);
   c10::Event end(c10::DeviceType::CUDA);
 
-  start.record(stream.unwrap());
-  end.record(stream.unwrap());
+  start.record(stream);
+  end.record(stream);
   end.synchronize();
 
   EXPECT_THROW(start.elapsedTime(end), std::exception);
@@ -84,8 +84,8 @@ TEST(EventTest, CudaEventElapsedTimeWithTimingEnabled) {
   c10::Event start(c10::DeviceType::CUDA, c10::EventFlag::BACKEND_DEFAULT);
   c10::Event end(c10::DeviceType::CUDA, c10::EventFlag::BACKEND_DEFAULT);
 
-  start.record(stream.unwrap());
-  end.record(stream.unwrap());
+  start.record(stream);
+  end.record(stream);
   end.synchronize();
 
   double elapsed_ms = -1.0;
@@ -93,6 +93,7 @@ TEST(EventTest, CudaEventElapsedTimeWithTimingEnabled) {
   EXPECT_GE(elapsed_ms, 0.0);
 }
 
+#ifdef PADDLE_WITH_CUDA
 TEST(EventTest, CudaEventRawStreamRecordCompatibility) {
   SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
   auto stream = c10::cuda::getCurrentCUDAStream();
@@ -101,6 +102,7 @@ TEST(EventTest, CudaEventRawStreamRecordCompatibility) {
   EXPECT_EQ(event.device_index(), stream.device_index());
   EXPECT_TRUE(event.was_marked_for_recording());
 }
+#endif
 
 TEST(EventTest, CudaEventRejectsDifferentDeviceRecord) {
   SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
@@ -112,7 +114,7 @@ TEST(EventTest, CudaEventRejectsDifferentDeviceRecord) {
   auto stream0 = c10::cuda::getDefaultCUDAStream(0);
   auto stream1 = c10::cuda::getDefaultCUDAStream(1);
 
-  EXPECT_NO_THROW(event.record(stream0.unwrap()));
-  EXPECT_THROW(event.record(stream1.unwrap()), std::exception);
+  EXPECT_NO_THROW(event.record(stream0));
+  EXPECT_THROW(event.record(stream1), std::exception);
 }
 #endif  // PADDLE_WITH_CUDA || PADDLE_WITH_HIP