diff --git a/paddle/phi/api/include/compat/c10/core/Event.h b/paddle/phi/api/include/compat/c10/core/Event.h
index 5d2c2d10b710d5..672fb7f496f907 100644
--- a/paddle/phi/api/include/compat/c10/core/Event.h
+++ b/paddle/phi/api/include/compat/c10/core/Event.h
@@ -16,141 +16,327 @@
 
 #include <c10/core/Device.h>
 #include <c10/core/DeviceType.h>
+#include <c10/core/Stream.h>
+
+#include <utility>
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+#include <c10/cuda/CUDAGuard.h>
 #include <c10/cuda/CUDAStream.h>
-#include <queue>
+#endif
+
 namespace c10 {
 
-/**
- * A backend-generic movable, not copyable, not thread-safe event.
- *
- * The design of this event follows that of CUDA and HIP events. These events
- * are recorded and waited on by streams and can be rerecorded to,
- * each rerecording essentially creating a new version of the event.
- * For example, if (in CPU time), stream X is asked to record E,
- * stream Y waits on E, and stream X is asked to record E again, then Y will
- * wait for X to finish the first call to record and not the second, because
- * it's waiting on the first version of event E, not the second.
- * Querying an event only returns the status of its most recent version.
- *
- * Backend-generic events are implemented by this class and
- * impl::InlineEvent. In addition to these events there are also
- * some backend-specific events, like ATen's CUDAEvent. Each of these
- * classes has its own use.
- *
- * impl::InlineEvent<...> or a backend-specific event should be
- * preferred when the backend is known at compile time and known to
- * be compiled. Backend-specific events may have additional functionality.
- *
- * This Event should be used if a particular backend may not be available,
- * or the backend required is not known at compile time.
- *
- * These generic events are built on top of DeviceGuardImpls, analogous
- * to DeviceGuard and InlineDeviceGuard. The name "DeviceGuardImpls,"
- * is no longer entirely accurate, as these classes implement the
- * backend-specific logic for a generic backend interface.
- *
- * See DeviceGuardImplInterface.h for a list of all supported flags.
- */
+enum class EventFlag { PYTORCH_DEFAULT, BACKEND_DEFAULT, INVALID };
 
-#ifdef PADDLE_WITH_CUDA
-
-class EventPool {
+struct Event final {
  public:
-  EventPool();
-  EventPool(const EventPool &) = delete;
-  EventPool(EventPool &&) = delete;
-  ~EventPool();
-
-  cudaEvent_t CreateCudaEventFromPool();
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+#ifdef PADDLE_WITH_HIP
+  using BackendEvent = hipEvent_t;
+  using BackendStream = hipStream_t;
+#else
+  using BackendEvent = cudaEvent_t;
+  using BackendStream = cudaStream_t;
+#endif
+#endif
 
-  static EventPool &Instance();
+  Event() = delete;
+  Event(const DeviceType device_type,
+        const EventFlag flag = EventFlag::PYTORCH_DEFAULT)
+      : device_type_(device_type), flag_(flag) {}
 
- private:
-  std::queue<cudaEvent_t> incomplished_events_;
-  std::mutex mtx_;
-};
+  Event(const Event&) = delete;
+  Event& operator=(const Event&) = delete;
 
-EventPool &EventPool::Instance() {
-  static EventPool pool;
-  return pool;
-}
+  Event(Event&& other) noexcept { MoveFrom(std::move(other)); }
+  Event& operator=(Event&& other) noexcept {
+    if (this != &other) {
+      DestroyBackendEvent();
+      MoveFrom(std::move(other));
+    }
+    return *this;
+  }
 
-EventPool::EventPool() {
-  for (size_t i = 0; i < 1000; ++i) {
-    cudaEvent_t new_event;
-    C10_CUDA_CHECK(cudaEventCreate(&new_event));
+  ~Event() { DestroyBackendEvent(); }
 
-    cudaEventRecord(new_event, 0);
-    incomplished_events_.push(new_event);
+  Device device() const noexcept { return Device(device_type_, device_index_); }
+  DeviceType device_type() const noexcept { return device_type_; }
+  DeviceIndex device_index() const noexcept { return device_index_; }
+  EventFlag flag() const noexcept { return flag_; }
+  bool was_marked_for_recording() const noexcept {
+    return was_marked_for_recording_;
   }
-}
-
-EventPool::~EventPool() {
-  std::unique_lock<std::mutex> lock(mtx_);
-  while (!incomplished_events_.empty()) {
-    cudaEvent_t event = incomplished_events_.front();
-    incomplished_events_.pop();
-    if (cudaEventQuery(event) == cudaSuccess) {
-      C10_CUDA_CHECK(cudaEventDestroy(event));
+
+  void recordOnce(const Stream& stream) {
+    if (!was_marked_for_recording_) {
+      record(stream);
     }
   }
-}
-
-cudaEvent_t EventPool::CreateCudaEventFromPool() {
-  std::unique_lock<std::mutex> lock(mtx_);
-
-  const auto &CreateNewEvent = [&]() -> cudaEvent_t {
-    cudaEvent_t new_event;
-    C10_CUDA_CHECK(cudaEventCreate(&new_event));
-    incomplished_events_.push(new_event);
-    return new_event;
-  };
-
-  const auto &CreateNewOrReuseEvent = [&]() -> cudaEvent_t {
-    cudaEvent_t front_event = incomplished_events_.front();
-    incomplished_events_.pop();
-    incomplished_events_.push(front_event);
-    if (cudaEventQuery(front_event) == cudaSuccess) {
-      return front_event;
+
+  void record(const Stream& stream) {
+    TORCH_CHECK(stream.device_type() == device_type_,
+                "Event device type ",
+                device_type_,
+                " does not match recording stream's device type ",
+                stream.device_type(),
+                ".");
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+    if (device_type_ == DeviceType::CUDA) {
+      RecordBackendEvent(static_cast<BackendStream>(stream.native_handle()),
+                         stream.device_index());
+      return;
     }
-    return CreateNewEvent();
-  };
+#endif
+    TORCH_CHECK(false, "Backend doesn't support events.");
+  }
 
-  if (incomplished_events_.empty()) {
-    return CreateNewEvent();
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  void record(const c10::cuda::CUDAStream& stream) { record(stream.unwrap()); }
+#endif
+
+#ifdef PADDLE_WITH_CUDA
+  // TODO(youge325): Remove after DeepEP paddle branch is updated to use
+  // c10::Stream
+  void record(const cudaStream_t& stream) {
+    TORCH_CHECK(
+        device_type_ == DeviceType::CUDA,
+        "Raw cudaStream_t recording is only supported for CUDA events.");
+    RecordBackendEvent(stream, phi::backends::gpu::GetCurrentDeviceId());
   }
-  return CreateNewOrReuseEvent();
-}
+#endif
 
-struct Event final {
- public:
-  Event(const DeviceType &type) {
-    // device_type is useless, only for compatibility
-    cuda_event_ = EventPool::Instance().CreateCudaEventFromPool();
+  void block(const Stream& stream) const {
+    if (!was_marked_for_recording_) {
+      return;
+    }
+    TORCH_CHECK(stream.device_type() == device_type_,
+                "Event device type ",
+                device_type_,
+                " does not match blocking stream's device type ",
+                stream.device_type(),
+                ".");
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+    if (device_type_ == DeviceType::CUDA && backend_event_) {
+      TORCH_CHECK(device_index_ == stream.device_index(),
+                  "Event device index ",
+                  static_cast<int>(device_index_),
+                  " does not match blocking stream's device index ",
+                  static_cast<int>(stream.device_index()),
+                  ".");
+      c10::cuda::CUDAGuard guard(device_index_);
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_GPU_SUCCESS(
+          hipStreamWaitEvent(static_cast<BackendStream>(stream.native_handle()),
+                             backend_event_,
+                             0));
+#else
+      C10_CUDA_CHECK(cudaStreamWaitEvent(
+          static_cast<BackendStream>(stream.native_handle()),
+          backend_event_,
+          0));
+#endif
+      return;
+    }
+#endif
+    TORCH_CHECK(false, "Backend doesn't support events.");
   }
 
-  void record(const Stream &stream) {
-    C10_CUDA_CHECK(cudaEventRecord(
-        cuda_event_, static_cast<cudaStream_t>(stream.native_handle())));
+  bool query() const {
+    if (!was_marked_for_recording_) {
+      return true;
+    }
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+    if (device_type_ == DeviceType::CUDA && backend_event_) {
+#ifdef PADDLE_WITH_HIP
+      const auto err = hipEventQuery(backend_event_);
+      if (err == hipSuccess) {
+        return true;
+      }
+      if (err != hipErrorNotReady) {
+        PADDLE_ENFORCE_GPU_SUCCESS(err);
+      } else {
+        (void)hipGetLastError();
+      }
+#else
+      const auto err = cudaEventQuery(backend_event_);
+      if (err == cudaSuccess) {
+        return true;
+      }
+      if (err != cudaErrorNotReady) {
+        C10_CUDA_CHECK(err);
+      } else {
+        (void)cudaGetLastError();
+      }
+#endif
+      return false;
+    }
+#endif
+    TORCH_CHECK(false, "Backend doesn't support events.");
+    return true;
   }
 
-  void record(const c10::cuda::CUDAStream &stream) { record(stream.unwrap()); }
+  double elapsedTime(const Event& event) const {
+    TORCH_CHECK(event.device_type() == device_type_,
+                "Event device type ",
+                device_type_,
+                " does not match other's device type ",
+                event.device_type(),
+                ".");
+    TORCH_CHECK(
+        flag_ == EventFlag::BACKEND_DEFAULT &&
+            event.flag_ == EventFlag::BACKEND_DEFAULT,
+        "Both events must be created with argument 'enable_timing=True'.");
+    TORCH_CHECK(
+        was_marked_for_recording_ && event.was_marked_for_recording_,
+        "Both events must be recorded before calculating elapsed time.");
+    TORCH_CHECK(
+        query() && event.query(),
+        "Both events must be completed before calculating elapsed time.");
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+    if (device_type_ == DeviceType::CUDA && backend_event_ &&
+        event.backend_event_) {
+      TORCH_CHECK(device_index_ == event.device_index_,
+                  "Event device index ",
+                  static_cast<int>(device_index_),
+                  " does not match other's device index ",
+                  static_cast<int>(event.device_index_),
+                  ".");
+      c10::cuda::CUDAGuard guard(device_index_);
+      float time_ms = 0.0f;
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_GPU_SUCCESS(
+          hipEventElapsedTime(&time_ms, backend_event_, event.backend_event_));
+#else
+      C10_CUDA_CHECK(
+          cudaEventElapsedTime(&time_ms, backend_event_, event.backend_event_));
+#endif
+      return static_cast<double>(time_ms);
+    }
+#endif
+    TORCH_CHECK(false, "Backend doesn't support event elapsedTime.");
+    return 0.0;
+  }
 
-  // TODO(youge325): Remove after DeepEP paddle branch is updated to use
-  // c10::Stream
-  void record(const cudaStream_t &stream) {
-    C10_CUDA_CHECK(cudaEventRecord(cuda_event_, stream));
+  void* eventId() const {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+    return backend_event_;
+#else
+    return nullptr;
+#endif
   }
 
-  void block(const Stream &stream) const {
-    C10_CUDA_CHECK(cudaStreamWaitEvent(
-        static_cast<cudaStream_t>(stream.native_handle()), cuda_event_, 0));
+  void synchronize() const {
+    if (!was_marked_for_recording_) {
+      return;
+    }
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+    if (device_type_ == DeviceType::CUDA && backend_event_) {
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_GPU_SUCCESS(hipEventSynchronize(backend_event_));
+#else
+      C10_CUDA_CHECK(cudaEventSynchronize(backend_event_));
+#endif
+      return;
+    }
+#endif
+    TORCH_CHECK(false, "Backend doesn't support events.");
   }
 
-  cudaEvent_t cuda_event() const { return cuda_event_; }
+#ifdef PADDLE_WITH_CUDA
+  cudaEvent_t cuda_event() const { return backend_event_; }
+#endif
 
  private:
-  cudaEvent_t cuda_event_;
+  DeviceType device_type_;
+  DeviceIndex device_index_ = -1;
+  EventFlag flag_ = EventFlag::PYTORCH_DEFAULT;
+  bool was_marked_for_recording_ = false;
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  BackendEvent backend_event_ = nullptr;
+
+  static unsigned int BackendEventCreateFlags(EventFlag flag) {
+    switch (flag) {
+      case EventFlag::PYTORCH_DEFAULT:
+#ifdef PADDLE_WITH_HIP
+        return hipEventDisableTiming;
+#else
+        return cudaEventDisableTiming;
+#endif
+      case EventFlag::BACKEND_DEFAULT:
+#ifdef PADDLE_WITH_HIP
+        return hipEventDefault;
+#else
+        return cudaEventDefault;
+#endif
+      default:
+        TORCH_CHECK(false, "CUDA event received unknown flag");
+    }
+  }
+
+  void EnsureBackendEventCreated(DeviceIndex stream_device_index) {
+    if (backend_event_) {
+      return;
+    }
+    c10::cuda::CUDAGuard guard(stream_device_index);
+#ifdef PADDLE_WITH_HIP
+    PADDLE_ENFORCE_GPU_SUCCESS(hipEventCreateWithFlags(
+        &backend_event_, BackendEventCreateFlags(flag_)));
+#else
+    C10_CUDA_CHECK(cudaEventCreateWithFlags(&backend_event_,
+                                            BackendEventCreateFlags(flag_)));
+#endif
+  }
+
+  void RecordBackendEvent(BackendStream stream,
+                          DeviceIndex stream_device_index) {
+    TORCH_CHECK(device_index_ == -1 || device_index_ == stream_device_index,
+                "Event device index ",
+                static_cast<int>(device_index_),
+                " does not match recording stream's device index ",
+                static_cast<int>(stream_device_index),
+                ".");
+    EnsureBackendEventCreated(stream_device_index);
+    c10::cuda::CUDAGuard guard(stream_device_index);
+#ifdef PADDLE_WITH_HIP
+    PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(backend_event_, stream));
+#else
+    C10_CUDA_CHECK(cudaEventRecord(backend_event_, stream));
+#endif
+    device_index_ = stream_device_index;
+    was_marked_for_recording_ = true;
+  }
+
+  void DestroyBackendEvent() noexcept {
+    if (!backend_event_) {
+      return;
+    }
+    try {
+      c10::cuda::CUDAGuard guard(device_index_);
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(backend_event_));
+#else
+      C10_CUDA_CHECK(cudaEventDestroy(backend_event_));
+#endif
+    } catch (...) {
+    }
+    backend_event_ = nullptr;
+  }
+#else
+  void DestroyBackendEvent() noexcept {}
+#endif
+
+  void MoveFrom(Event&& other) noexcept {
+    device_type_ = other.device_type_;
+    device_index_ = other.device_index_;
+    flag_ = other.flag_;
+    was_marked_for_recording_ = other.was_marked_for_recording_;
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+    backend_event_ = std::exchange(other.backend_event_, nullptr);
+#endif
+    other.device_index_ = -1;
+    other.was_marked_for_recording_ = false;
+  }
 };
 
 }  // namespace c10
@@ -158,5 +344,3 @@ struct Event final {
 namespace torch {
 using c10::Event;
 }  // namespace torch
-
-#endif
diff --git a/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h b/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h
index f88dd043317252..364a634f9abecf 100644
--- a/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h
+++ b/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h
@@ -74,7 +74,7 @@ inline StreamPoolState& get_pool(int device_index) {
 inline void init_pool(int device_index, StreamPoolState* state) {
   phi::backends::gpu::GPUDeviceGuard guard(device_index);
   int lo_pri = 0, hi_pri = 0;
-  cudaDeviceGetStreamPriorityRange(&lo_pri, &hi_pri);
+  C10_CUDA_CHECK(cudaDeviceGetStreamPriorityRange(&lo_pri, &hi_pri));
   for (int i = 0; i < kStreamsPerPool; ++i) {
     C10_CUDA_CHECK(cudaStreamCreateWithPriority(
         &state->low_priority[i], cudaStreamNonBlocking, lo_pri));
@@ -192,6 +192,9 @@ inline CUDAStream getCurrentCUDAStream(c10::DeviceIndex device_index = -1) {
  * record_stream lifetime semantics.
  */
 inline CUDAStream getStreamFromPool(const bool isHighPriority = false,
+                                    c10::DeviceIndex device_index = -1);
+
+inline CUDAStream getStreamFromPool(const int priority,
                                     c10::DeviceIndex device_index = -1) {
   if (device_index == -1) {
     device_index = phi::backends::gpu::GetCurrentDeviceId();
@@ -203,7 +206,9 @@ inline CUDAStream getStreamFromPool(const bool isHighPriority = false,
   });
 
   cudaStream_t raw;
-  if (isHighPriority) {
+  // Keep parity with PyTorch API shape: negative priority selects the
+  // high-priority pool, non-negative selects the low-priority pool.
+  if (priority < 0) {
     raw = state.high_priority[state.hp_counter.fetch_add(1) %
                               detail::kStreamsPerPool];
   } else {
@@ -213,6 +218,11 @@ inline CUDAStream getStreamFromPool(const bool isHighPriority = false,
   return make_cuda_stream(raw, device_index);
 }
 
+inline CUDAStream getStreamFromPool(const bool isHighPriority,
+                                    c10::DeviceIndex device_index) {
+  return getStreamFromPool(isHighPriority ? -1 : 0, device_index);
+}
+
 /**
  * Set the current CUDA stream for the device of the given stream in the
  * calling thread.
diff --git a/test/cpp/compat/ATen_record_stream_test.cc b/test/cpp/compat/ATen_record_stream_test.cc
index db895005714d0e..8be51d243d1022 100644
--- a/test/cpp/compat/ATen_record_stream_test.cc
+++ b/test/cpp/compat/ATen_record_stream_test.cc
@@ -47,6 +47,14 @@ class RecordStreamTest : public ::testing::Test {
 
 // --- Happy path: CUDA tensor + current CUDA stream should succeed ---
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+using RecordCudaStreamMethod = void (at::Tensor::*)(at::cuda::CUDAStream) const;
+[[maybe_unused]] static RecordCudaStreamMethod g_record_cuda_stream_method =
+    &at::Tensor::record_stream;
+
+using RecordRawCudaStreamMethod = void (at::Tensor::*)(cudaStream_t) const;
+[[maybe_unused]] static RecordRawCudaStreamMethod
+    g_record_raw_cuda_stream_method = &at::Tensor::record_stream;
+
 TEST_F(RecordStreamTest, CudaTensorCurrentCudaStream) {
   SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
   auto stream = at::cuda::getCurrentCUDAStream();
@@ -60,6 +68,12 @@ TEST_F(RecordStreamTest, CudaTensorDefaultCudaStream) {
   c10::Stream default_stream = c10::cuda::getDefaultCUDAStream().unwrap();
   EXPECT_NO_THROW(cuda_tensor.record_stream(default_stream));
 }
+
+TEST_F(RecordStreamTest, CudaTensorRawCudaStream) {
+  SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
+  auto stream = at::cuda::getCurrentCUDAStream();
+  EXPECT_NO_THROW(cuda_tensor.record_stream(stream.raw_stream()));
+}
 #endif  // PADDLE_WITH_CUDA || PADDLE_WITH_HIP
 
 // --- Error path: CPU tensor + CPU stream (record_stream does not support CPU
diff --git a/test/cpp/compat/CMakeLists.txt b/test/cpp/compat/CMakeLists.txt
index 1e02c599f031f6..e771d03fb431ef 100644
--- a/test/cpp/compat/CMakeLists.txt
+++ b/test/cpp/compat/CMakeLists.txt
@@ -22,6 +22,7 @@ if(NOT WIN32)
     nv_test(c10_TypeMeta_test SRCS c10_TypeMeta_test.cc)
     nv_test(c10_storage_test SRCS c10_storage_test.cc)
     nv_test(c10_Stream_test SRCS c10_Stream_test.cc)
+    nv_test(c10_Event_test SRCS c10_Event_test.cc)
     nv_test(c10_SizesAndStrides_test SRCS c10_SizesAndStrides_test.cc)
     nv_test(c10_layout_test SRCS c10_layout_test.cc)
     nv_test(ATen_clamp_test SRCS ATen_clamp_test.cc)
diff --git a/test/cpp/compat/c10_Event_test.cc b/test/cpp/compat/c10_Event_test.cc
new file mode 100644
index 00000000000000..a6933e06efea36
--- /dev/null
+++ b/test/cpp/compat/c10_Event_test.cc
@@ -0,0 +1,120 @@
+// Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <c10/core/Event.h>
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+#include <c10/cuda/CUDAFunctions.h>
+#include <c10/cuda/CUDAStream.h>
+#endif
+
+#include "gtest/gtest.h"
+#include "test/cpp/compat/cuda_test_utils.h"
+
+TEST(EventTest, CpuEventDefaultProperties) {
+  c10::Event event(c10::DeviceType::CPU);
+  EXPECT_EQ(event.device_type(), c10::DeviceType::CPU);
+  EXPECT_EQ(event.device_index(), -1);
+  EXPECT_EQ(event.flag(), c10::EventFlag::PYTORCH_DEFAULT);
+  EXPECT_FALSE(event.was_marked_for_recording());
+  EXPECT_TRUE(event.query());
+  EXPECT_EQ(event.eventId(), nullptr);
+}
+
+TEST(EventTest, CpuEventRecordThrows) {
+  c10::Event event(c10::DeviceType::CPU);
+  c10::Stream stream(c10::Stream::DEFAULT,
+                     c10::Device(c10::DeviceType::CPU, 0));
+  EXPECT_THROW(event.record(stream), std::exception);
+  EXPECT_THROW(event.recordOnce(stream), std::exception);
+}
+
+#ifdef PADDLE_WITH_CUDA
+using RawEventRecordMethod = void (c10::Event::*)(const cudaStream_t&);
+[[maybe_unused]] static RawEventRecordMethod g_raw_event_record_method =
+    &c10::Event::record;
+#endif
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+TEST(EventTest, CudaEventLazyCreateAndRecord) {
+  SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
+  c10::Event event(c10::DeviceType::CUDA);
+  auto stream = c10::cuda::getCurrentCUDAStream();
+
+  EXPECT_EQ(event.device_index(), -1);
+  EXPECT_EQ(event.eventId(), nullptr);
+  EXPECT_FALSE(event.was_marked_for_recording());
+
+  EXPECT_NO_THROW(event.record(stream));
+  EXPECT_EQ(event.device_index(), stream.device_index());
+  EXPECT_NE(event.eventId(), nullptr);
+  EXPECT_TRUE(event.was_marked_for_recording());
+
+  EXPECT_NO_THROW(event.synchronize());
+  EXPECT_TRUE(event.query());
+}
+
+TEST(EventTest, CudaEventElapsedTimeRequiresTimingFlag) {
+  SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
+  auto stream = c10::cuda::getCurrentCUDAStream();
+  c10::Event start(c10::DeviceType::CUDA);
+  c10::Event end(c10::DeviceType::CUDA);
+
+  start.record(stream);
+  end.record(stream);
+  end.synchronize();
+
+  EXPECT_THROW(start.elapsedTime(end), std::exception);
+}
+
+TEST(EventTest, CudaEventElapsedTimeWithTimingEnabled) {
+  SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
+  auto stream = c10::cuda::getCurrentCUDAStream();
+  c10::Event start(c10::DeviceType::CUDA, c10::EventFlag::BACKEND_DEFAULT);
+  c10::Event end(c10::DeviceType::CUDA, c10::EventFlag::BACKEND_DEFAULT);
+
+  start.record(stream);
+  end.record(stream);
+  end.synchronize();
+
+  double elapsed_ms = -1.0;
+  EXPECT_NO_THROW(elapsed_ms = start.elapsedTime(end));
+  EXPECT_GE(elapsed_ms, 0.0);
+}
+
+#ifdef PADDLE_WITH_CUDA
+TEST(EventTest, CudaEventRawStreamRecordCompatibility) {
+  SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
+  auto stream = c10::cuda::getCurrentCUDAStream();
+  c10::Event event(c10::DeviceType::CUDA);
+  EXPECT_NO_THROW(event.record(stream.raw_stream()));
+  EXPECT_EQ(event.device_index(), stream.device_index());
+  EXPECT_TRUE(event.was_marked_for_recording());
+}
+#endif
+
+TEST(EventTest, CudaEventRejectsDifferentDeviceRecord) {
+  SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
+  if (c10::cuda::device_count() < 2) {
+    return;
+  }
+
+  c10::Event event(c10::DeviceType::CUDA, c10::EventFlag::BACKEND_DEFAULT);
+  auto stream0 = c10::cuda::getDefaultCUDAStream(0);
+  auto stream1 = c10::cuda::getDefaultCUDAStream(1);
+
+  EXPECT_NO_THROW(event.record(stream0));
+  EXPECT_THROW(event.record(stream1), std::exception);
+}
+#endif  // PADDLE_WITH_CUDA || PADDLE_WITH_HIP