From 2f67c076cecd391fa7e758a4032e2e2ed7e08ec4 Mon Sep 17 00:00:00 2001 From: youge325 Date: Tue, 24 Mar 2026 22:26:10 +0800 Subject: [PATCH 1/8] [Cpp API Compatibility] remove CUDAStream::raw_stream() --- .../api/include/compat/c10/cuda/CUDAStream.h | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h b/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h index f88dd043317252..a6c1e95b61ef0f 100644 --- a/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h +++ b/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h @@ -74,7 +74,7 @@ inline StreamPoolState& get_pool(int device_index) { inline void init_pool(int device_index, StreamPoolState* state) { phi::backends::gpu::GPUDeviceGuard guard(device_index); int lo_pri = 0, hi_pri = 0; - cudaDeviceGetStreamPriorityRange(&lo_pri, &hi_pri); + C10_CUDA_CHECK(cudaDeviceGetStreamPriorityRange(&lo_pri, &hi_pri)); for (int i = 0; i < kStreamsPerPool; ++i) { C10_CUDA_CHECK(cudaStreamCreateWithPriority( &state->low_priority[i], cudaStreamNonBlocking, lo_pri)); @@ -134,10 +134,6 @@ class CUDAStream { Device device() const { return Device(DeviceType::CUDA, device_index()); } - // TODO(youge325): Remove after DeepEP paddle branch is updated to use - // stream() - cudaStream_t raw_stream() const { return stream(); } - private: Stream stream_; }; @@ -192,6 +188,9 @@ inline CUDAStream getCurrentCUDAStream(c10::DeviceIndex device_index = -1) { * record_stream lifetime semantics. */ inline CUDAStream getStreamFromPool(const bool isHighPriority = false, + c10::DeviceIndex device_index = -1); + +inline CUDAStream getStreamFromPool(const int priority, c10::DeviceIndex device_index = -1) { if (device_index == -1) { device_index = phi::backends::gpu::GetCurrentDeviceId(); @@ -203,7 +202,9 @@ inline CUDAStream getStreamFromPool(const bool isHighPriority = false, }); cudaStream_t raw; - if (isHighPriority) { + // Keep parity with PyTorch API shape: negative priority selects the + // high-priority pool, non-negative selects the low-priority pool. + if (priority < 0) { raw = state.high_priority[state.hp_counter.fetch_add(1) % detail::kStreamsPerPool]; } else { @@ -213,6 +214,11 @@ inline CUDAStream getStreamFromPool(const bool isHighPriority = false, return make_cuda_stream(raw, device_index); } +inline CUDAStream getStreamFromPool(const bool isHighPriority, + c10::DeviceIndex device_index) { + return getStreamFromPool(isHighPriority ? -1 : 0, device_index); +} + /** * Set the current CUDA stream for the device of the given stream in the * calling thread. From 2e43d1f1331fea104c1071516ba575189e3743b8 Mon Sep 17 00:00:00 2001 From: youge325 Date: Wed, 25 Mar 2026 19:51:27 +0800 Subject: [PATCH 2/8] remove some other APIs --- .../phi/api/include/compat/ATen/core/TensorBody.h | 6 ------ .../phi/api/include/compat/ATen/ops/record_stream.h | 13 ------------- paddle/phi/api/include/compat/c10/core/Event.h | 6 ------ 3 files changed, 25 deletions(-) diff --git a/paddle/phi/api/include/compat/ATen/core/TensorBody.h b/paddle/phi/api/include/compat/ATen/core/TensorBody.h index a12dfba80b5bec..6445befda15ee5 100644 --- a/paddle/phi/api/include/compat/ATen/core/TensorBody.h +++ b/paddle/phi/api/include/compat/ATen/core/TensorBody.h @@ -687,12 +687,6 @@ class Tensor : public TensorBase { } void record_stream(at::Stream s) const; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - void record_stream(at::cuda::CUDAStream s) const; - // TODO(youge325): Remove after DeepEP paddle branch is updated to use - // at::Stream - void record_stream(cudaStream_t s) const; -#endif Tensor var(int dim) const { return var(at::IntArrayRef{dim}, true, false); } diff --git a/paddle/phi/api/include/compat/ATen/ops/record_stream.h b/paddle/phi/api/include/compat/ATen/ops/record_stream.h index 73cb5dd4b2247c..ea6a21d98140f5 100644 --- a/paddle/phi/api/include/compat/ATen/ops/record_stream.h +++ b/paddle/phi/api/include/compat/ATen/ops/record_stream.h @@ -54,17 +54,4 @@ inline void Tensor::record_stream(at::cuda::CUDAStream s) const { record_stream(static_cast(s)); } -// TODO(youge325): Remove after DeepEP paddle branch is updated to use -// at::Stream -inline void Tensor::record_stream(cudaStream_t s) const { - auto dense_tensor = - std::dynamic_pointer_cast(tensor_.impl()); - PD_CHECK(dense_tensor != nullptr, - "record_stream only supports DenseTensor, but got a non-dense " - "tensor implementation."); - PD_CHECK(dense_tensor->place().GetType() != phi::AllocationType::CPU, - "record_stream is not supported for CPU tensors."); - paddle::memory::RecordStream(dense_tensor->Holder(), s); -} -#endif } // namespace at diff --git a/paddle/phi/api/include/compat/c10/core/Event.h b/paddle/phi/api/include/compat/c10/core/Event.h index 5d2c2d10b710d5..2c58fe97587245 100644 --- a/paddle/phi/api/include/compat/c10/core/Event.h +++ b/paddle/phi/api/include/compat/c10/core/Event.h @@ -136,12 +136,6 @@ struct Event final { void record(const c10::cuda::CUDAStream &stream) { record(stream.unwrap()); } - // TODO(youge325): Remove after DeepEP paddle branch is updated to use - // c10::Stream - void record(const cudaStream_t &stream) { - C10_CUDA_CHECK(cudaEventRecord(cuda_event_, stream)); - } - void block(const Stream &stream) const { C10_CUDA_CHECK(cudaStreamWaitEvent( static_cast(stream.native_handle()), cuda_event_, 0)); From e6d3638ae6c257b7de75d28700ddabdbf4ba5a3f Mon Sep 17 00:00:00 2001 From: youge325 Date: Wed, 25 Mar 2026 20:01:22 +0800 Subject: [PATCH 3/8] remove useless annotation --- .../phi/api/include/compat/c10/core/Event.h | 32 ------------------- 1 file changed, 32 deletions(-) diff --git a/paddle/phi/api/include/compat/c10/core/Event.h b/paddle/phi/api/include/compat/c10/core/Event.h index 2c58fe97587245..04ea3c7eaae0f1 100644 --- a/paddle/phi/api/include/compat/c10/core/Event.h +++ b/paddle/phi/api/include/compat/c10/core/Event.h @@ -20,38 +20,6 @@ #include namespace c10 { -/** - * A backend-generic movable, not copyable, not thread-safe event. - * - * The design of this event follows that of CUDA and HIP events. These events - * are recorded and waited on by streams and can be rerecorded to, - * each rerecording essentially creating a new version of the event. - * For example, if (in CPU time), stream X is asked to record E, - * stream Y waits on E, and stream X is asked to record E again, then Y will - * wait for X to finish the first call to record and not the second, because - * it's waiting on the first version of event E, not the second. - * Querying an event only returns the status of its most recent version. - * - * Backend-generic events are implemented by this class and - * impl::InlineEvent. In addition to these events there are also - * some backend-specific events, like ATen's CUDAEvent. Each of these - * classes has its own use. - * - * impl::InlineEvent<...> or a backend-specific event should be - * preferred when the backend is known at compile time and known to - * be compiled. Backend-specific events may have additional functionality. - * - * This Event should be used if a particular backend may not be available, - * or the backend required is not known at compile time. - * - * These generic events are built on top of DeviceGuardImpls, analogous - * to DeviceGuard and InlineDeviceGuard. The name "DeviceGuardImpls," - * is no longer entirely accurate, as these classes implement the - * backend-specific logic for a generic backend interface. - * - * See DeviceGuardImplInterface.h for a list of all supported flags. - */ - #ifdef PADDLE_WITH_CUDA class EventPool { From 91428a612ff98e5d4b65fabffa6a4d16719cedf9 Mon Sep 17 00:00:00 2001 From: youge325 Date: Wed, 25 Mar 2026 20:24:52 +0800 Subject: [PATCH 4/8] using alias for c10::Stream --- paddle/phi/api/include/compat/c10/core/Stream.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/paddle/phi/api/include/compat/c10/core/Stream.h b/paddle/phi/api/include/compat/c10/core/Stream.h index 58912130daf303..e000ca9ac13650 100644 --- a/paddle/phi/api/include/compat/c10/core/Stream.h +++ b/paddle/phi/api/include/compat/c10/core/Stream.h @@ -106,3 +106,7 @@ struct hash { } }; } // namespace std + +namespace at { +using c10::Stream; +} From dc7610a09245716a88b4db070a63c1d3d4681a97 Mon Sep 17 00:00:00 2001 From: youge325 Date: Wed, 25 Mar 2026 20:31:29 +0800 Subject: [PATCH 5/8] fix compiling error --- paddle/phi/api/include/compat/ATen/ops/record_stream.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/paddle/phi/api/include/compat/ATen/ops/record_stream.h b/paddle/phi/api/include/compat/ATen/ops/record_stream.h index ea6a21d98140f5..ff43391cdc796c 100644 --- a/paddle/phi/api/include/compat/ATen/ops/record_stream.h +++ b/paddle/phi/api/include/compat/ATen/ops/record_stream.h @@ -49,9 +49,4 @@ inline void Tensor::record_stream(at::Stream s) const { #endif } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -inline void Tensor::record_stream(at::cuda::CUDAStream s) const { - record_stream(static_cast(s)); -} - } // namespace at From 7b70cb059e0762aa709beebc590287b1c83a90be Mon Sep 17 00:00:00 2001 From: youge325 Date: Wed, 1 Apr 2026 16:18:57 +0800 Subject: [PATCH 6/8] align Event related APIs --- .../phi/api/include/compat/c10/core/Event.h | 114 ++++++++++++++++-- 1 file changed, 104 insertions(+), 10 deletions(-) diff --git a/paddle/phi/api/include/compat/c10/core/Event.h b/paddle/phi/api/include/compat/c10/core/Event.h index 04ea3c7eaae0f1..b880c28122705e 100644 --- a/paddle/phi/api/include/compat/c10/core/Event.h +++ b/paddle/phi/api/include/compat/c10/core/Event.h @@ -16,10 +16,17 @@ #include #include +#include + +#ifdef PADDLE_WITH_CUDA #include #include +#endif + namespace c10 { +enum class EventFlag { PYTORCH_DEFAULT, BACKEND_DEFAULT, INVALID }; + #ifdef PADDLE_WITH_CUDA class EventPool { @@ -90,29 +97,118 @@ cudaEvent_t EventPool::CreateCudaEventFromPool() { return CreateNewOrReuseEvent(); } +#endif // PADDLE_WITH_CUDA + struct Event final { public: - Event(const DeviceType &type) { - // device_type is useless, only for compatibility - cuda_event_ = EventPool::Instance().CreateCudaEventFromPool(); + Event() = delete; + Event(const DeviceType device_type, + const EventFlag flag = EventFlag::PYTORCH_DEFAULT) + : device_type_(device_type), flag_(flag) { +#ifdef PADDLE_WITH_CUDA + if (device_type == DeviceType::CUDA) { + cuda_event_ = EventPool::Instance().CreateCudaEventFromPool(); + } +#endif + } + + Event(const Event &) = delete; + Event &operator=(const Event &) = delete; + Event(Event &&) = default; + Event &operator=(Event &&) = default; + ~Event() = default; + + Device device() const noexcept { return Device(device_type_, device_index_); } + DeviceType device_type() const noexcept { return device_type_; } + DeviceIndex device_index() const noexcept { return device_index_; } + EventFlag flag() const noexcept { return flag_; } + bool was_marked_for_recording() const noexcept { + return was_marked_for_recording_; + } + + void recordOnce(const Stream &stream) { + if (!was_marked_for_recording_) record(stream); } void record(const Stream &stream) { - C10_CUDA_CHECK(cudaEventRecord( - cuda_event_, static_cast(stream.native_handle()))); + TORCH_CHECK( + stream.device_type() == device_type_, + "Event device type does not match recording stream's device type."); +#ifdef PADDLE_WITH_CUDA + if (device_type_ == DeviceType::CUDA && cuda_event_) { + C10_CUDA_CHECK(cudaEventRecord( + cuda_event_, static_cast(stream.native_handle()))); + was_marked_for_recording_ = true; + device_index_ = stream.device_index(); + return; + } +#endif + TORCH_CHECK(false, "Backend doesn't support events."); } +#ifdef PADDLE_WITH_CUDA void record(const c10::cuda::CUDAStream &stream) { record(stream.unwrap()); } +#endif void block(const Stream &stream) const { - C10_CUDA_CHECK(cudaStreamWaitEvent( - static_cast(stream.native_handle()), cuda_event_, 0)); + if (!was_marked_for_recording_) return; + TORCH_CHECK( + stream.device_type() == device_type_, + "Event device type does not match blocking stream's device type."); +#ifdef PADDLE_WITH_CUDA + if (device_type_ == DeviceType::CUDA && cuda_event_) { + C10_CUDA_CHECK(cudaStreamWaitEvent( + static_cast(stream.native_handle()), cuda_event_, 0)); + return; + } +#endif + TORCH_CHECK(false, "Backend doesn't support events."); } + bool query() const { + if (!was_marked_for_recording_) return true; +#ifdef PADDLE_WITH_CUDA + if (device_type_ == DeviceType::CUDA && cuda_event_) { + return cudaEventQuery(cuda_event_) == cudaSuccess; + } +#endif + TORCH_CHECK(false, "Backend doesn't support events."); + return true; + } + + double elapsedTime(const Event &event) const { + (void)event; + return 0.0; + } + + void *eventId() const { +#ifdef PADDLE_WITH_CUDA + return cuda_event_; +#else + return nullptr; +#endif + } + + void synchronize() const { +#ifdef PADDLE_WITH_CUDA + if (device_type_ == DeviceType::CUDA && cuda_event_) { + C10_CUDA_CHECK(cudaEventSynchronize(cuda_event_)); + } +#endif + } + +#ifdef PADDLE_WITH_CUDA cudaEvent_t cuda_event() const { return cuda_event_; } +#endif private: - cudaEvent_t cuda_event_; + DeviceType device_type_; + DeviceIndex device_index_ = -1; + EventFlag flag_ = EventFlag::PYTORCH_DEFAULT; + bool was_marked_for_recording_ = false; +#ifdef PADDLE_WITH_CUDA + cudaEvent_t cuda_event_ = nullptr; +#endif }; } // namespace c10 @@ -120,5 +216,3 @@ struct Event final { namespace torch { using c10::Event; } // namespace torch - -#endif From ce69b67d7d0750e0a32bd8741f941b1ff37afb5d Mon Sep 17 00:00:00 2001 From: youge325 Date: Thu, 2 Apr 2026 15:28:44 +0800 Subject: [PATCH 7/8] revert deleted stream APIs and fix Event --- .../api/include/compat/ATen/core/TensorBody.h | 6 + .../include/compat/ATen/ops/record_stream.h | 18 ++ .../phi/api/include/compat/c10/core/Event.h | 273 +++++++++++------- .../phi/api/include/compat/c10/core/Stream.h | 4 - .../api/include/compat/c10/cuda/CUDAStream.h | 4 + test/cpp/compat/ATen_record_stream_test.cc | 14 + test/cpp/compat/CMakeLists.txt | 1 + test/cpp/compat/c10_Event_test.cc | 118 ++++++++ 8 files changed, 329 insertions(+), 109 deletions(-) create mode 100644 test/cpp/compat/c10_Event_test.cc diff --git a/paddle/phi/api/include/compat/ATen/core/TensorBody.h b/paddle/phi/api/include/compat/ATen/core/TensorBody.h index 6445befda15ee5..a12dfba80b5bec 100644 --- a/paddle/phi/api/include/compat/ATen/core/TensorBody.h +++ b/paddle/phi/api/include/compat/ATen/core/TensorBody.h @@ -687,6 +687,12 @@ class Tensor : public TensorBase { } void record_stream(at::Stream s) const; +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + void record_stream(at::cuda::CUDAStream s) const; + // TODO(youge325): Remove after DeepEP paddle branch is updated to use + // at::Stream + void record_stream(cudaStream_t s) const; +#endif Tensor var(int dim) const { return var(at::IntArrayRef{dim}, true, false); } diff --git a/paddle/phi/api/include/compat/ATen/ops/record_stream.h b/paddle/phi/api/include/compat/ATen/ops/record_stream.h index ff43391cdc796c..73cb5dd4b2247c 100644 --- a/paddle/phi/api/include/compat/ATen/ops/record_stream.h +++ b/paddle/phi/api/include/compat/ATen/ops/record_stream.h @@ -49,4 +49,22 @@ inline void Tensor::record_stream(at::Stream s) const { #endif } +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +inline void Tensor::record_stream(at::cuda::CUDAStream s) const { + record_stream(static_cast(s)); +} + +// TODO(youge325): Remove after DeepEP paddle branch is updated to use +// at::Stream +inline void Tensor::record_stream(cudaStream_t s) const { + auto dense_tensor = + std::dynamic_pointer_cast(tensor_.impl()); + PD_CHECK(dense_tensor != nullptr, + "record_stream only supports DenseTensor, but got a non-dense " + "tensor implementation."); + PD_CHECK(dense_tensor->place().GetType() != phi::AllocationType::CPU, + "record_stream is not supported for CPU tensors."); + paddle::memory::RecordStream(dense_tensor->Holder(), s); +} +#endif } // namespace at diff --git a/paddle/phi/api/include/compat/c10/core/Event.h b/paddle/phi/api/include/compat/c10/core/Event.h index b880c28122705e..00919c7f13e300 100644 --- a/paddle/phi/api/include/compat/c10/core/Event.h +++ b/paddle/phi/api/include/compat/c10/core/Event.h @@ -18,105 +18,37 @@ #include #include +#include + #ifdef PADDLE_WITH_CUDA +#include #include -#include #endif namespace c10 { enum class EventFlag { PYTORCH_DEFAULT, BACKEND_DEFAULT, INVALID }; -#ifdef PADDLE_WITH_CUDA - -class EventPool { - public: - EventPool(); - EventPool(const EventPool &) = delete; - EventPool(EventPool &&) = delete; - ~EventPool(); - - cudaEvent_t CreateCudaEventFromPool(); - - static EventPool &Instance(); - - private: - std::queue incomplished_events_; - std::mutex mtx_; -}; - -EventPool &EventPool::Instance() { - static EventPool pool; - return pool; -} - -EventPool::EventPool() { - for (size_t i = 0; i < 1000; ++i) { - cudaEvent_t new_event; - C10_CUDA_CHECK(cudaEventCreate(&new_event)); - - cudaEventRecord(new_event, 0); - incomplished_events_.push(new_event); - } -} - -EventPool::~EventPool() { - std::unique_lock lock(mtx_); - while (!incomplished_events_.empty()) { - cudaEvent_t event = incomplished_events_.front(); - incomplished_events_.pop(); - if (cudaEventQuery(event) == cudaSuccess) { - C10_CUDA_CHECK(cudaEventDestroy(event)); - } - } -} - -cudaEvent_t EventPool::CreateCudaEventFromPool() { - std::unique_lock lock(mtx_); - - const auto &CreateNewEvent = [&]() -> cudaEvent_t { - cudaEvent_t new_event; - C10_CUDA_CHECK(cudaEventCreate(&new_event)); - incomplished_events_.push(new_event); - return new_event; - }; - - const auto &CreateNewOrReuseEvent = [&]() -> cudaEvent_t { - cudaEvent_t front_event = incomplished_events_.front(); - incomplished_events_.pop(); - incomplished_events_.push(front_event); - if (cudaEventQuery(front_event) == cudaSuccess) { - return front_event; - } - return CreateNewEvent(); - }; - - if (incomplished_events_.empty()) { - return CreateNewEvent(); - } - return CreateNewOrReuseEvent(); -} - -#endif // PADDLE_WITH_CUDA - struct Event final { public: Event() = delete; Event(const DeviceType device_type, const EventFlag flag = EventFlag::PYTORCH_DEFAULT) - : device_type_(device_type), flag_(flag) { -#ifdef PADDLE_WITH_CUDA - if (device_type == DeviceType::CUDA) { - cuda_event_ = EventPool::Instance().CreateCudaEventFromPool(); + : device_type_(device_type), flag_(flag) {} + + Event(const Event&) = delete; + Event& operator=(const Event&) = delete; + + Event(Event&& other) noexcept { MoveFrom(std::move(other)); } + Event& operator=(Event&& other) noexcept { + if (this != &other) { + DestroyCudaEvent(); + MoveFrom(std::move(other)); } -#endif + return *this; } - Event(const Event &) = delete; - Event &operator=(const Event &) = delete; - Event(Event &&) = default; - Event &operator=(Event &&) = default; - ~Event() = default; + ~Event() { DestroyCudaEvent(); } Device device() const noexcept { return Device(device_type_, device_index_); } DeviceType device_type() const noexcept { return device_type_; } @@ -126,20 +58,23 @@ struct Event final { return was_marked_for_recording_; } - void recordOnce(const Stream &stream) { - if (!was_marked_for_recording_) record(stream); + void recordOnce(const Stream& stream) { + if (!was_marked_for_recording_) { + record(stream); + } } - void record(const Stream &stream) { - TORCH_CHECK( - stream.device_type() == device_type_, - "Event device type does not match recording stream's device type."); + void record(const Stream& stream) { + TORCH_CHECK(stream.device_type() == device_type_, + "Event device type ", + device_type_, + " does not match recording stream's device type ", + stream.device_type(), + "."); #ifdef PADDLE_WITH_CUDA - if (device_type_ == DeviceType::CUDA && cuda_event_) { - C10_CUDA_CHECK(cudaEventRecord( - cuda_event_, static_cast(stream.native_handle()))); - was_marked_for_recording_ = true; - device_index_ = stream.device_index(); + if (device_type_ == DeviceType::CUDA) { + RecordCudaEvent(static_cast(stream.native_handle()), + stream.device_index()); return; } #endif @@ -147,16 +82,37 @@ struct Event final { } #ifdef PADDLE_WITH_CUDA - void record(const c10::cuda::CUDAStream &stream) { record(stream.unwrap()); } -#endif + void record(const c10::cuda::CUDAStream& stream) { record(stream.unwrap()); } - void block(const Stream &stream) const { - if (!was_marked_for_recording_) return; + // TODO(youge325): Remove after DeepEP paddle branch is updated to use + // c10::Stream + void record(const cudaStream_t& stream) { TORCH_CHECK( - stream.device_type() == device_type_, - "Event device type does not match blocking stream's device type."); + device_type_ == DeviceType::CUDA, + "Raw cudaStream_t recording is only supported for CUDA events."); + RecordCudaEvent(stream, phi::backends::gpu::GetCurrentDeviceId()); + } +#endif + + void block(const Stream& stream) const { + if (!was_marked_for_recording_) { + return; + } + TORCH_CHECK(stream.device_type() == device_type_, + "Event device type ", + device_type_, + " does not match blocking stream's device type ", + stream.device_type(), + "."); #ifdef PADDLE_WITH_CUDA if (device_type_ == DeviceType::CUDA && cuda_event_) { + TORCH_CHECK(device_index_ == stream.device_index(), + "Event device index ", + static_cast(device_index_), + " does not match blocking stream's device index ", + static_cast(stream.device_index()), + "."); + c10::cuda::CUDAGuard guard(device_index_); C10_CUDA_CHECK(cudaStreamWaitEvent( static_cast(stream.native_handle()), cuda_event_, 0)); return; @@ -166,22 +122,64 @@ struct Event final { } bool query() const { - if (!was_marked_for_recording_) return true; + if (!was_marked_for_recording_) { + return true; + } #ifdef PADDLE_WITH_CUDA if (device_type_ == DeviceType::CUDA && cuda_event_) { - return cudaEventQuery(cuda_event_) == cudaSuccess; + const auto err = cudaEventQuery(cuda_event_); + if (err == cudaSuccess) { + return true; + } + if (err != cudaErrorNotReady) { + C10_CUDA_CHECK(err); + } else { + (void)cudaGetLastError(); + } + return false; } #endif TORCH_CHECK(false, "Backend doesn't support events."); return true; } - double elapsedTime(const Event &event) const { - (void)event; + double elapsedTime(const Event& event) const { + TORCH_CHECK(event.device_type() == device_type_, + "Event device type ", + device_type_, + " does not match other's device type ", + event.device_type(), + "."); + TORCH_CHECK( + flag_ == EventFlag::BACKEND_DEFAULT && + event.flag_ == EventFlag::BACKEND_DEFAULT, + "Both events must be created with argument 'enable_timing=True'."); + TORCH_CHECK( + was_marked_for_recording_ && event.was_marked_for_recording_, + "Both events must be recorded before calculating elapsed time."); + TORCH_CHECK( + query() && event.query(), + "Both events must be completed before calculating elapsed time."); +#ifdef PADDLE_WITH_CUDA + if (device_type_ == DeviceType::CUDA && cuda_event_ && event.cuda_event_) { + TORCH_CHECK(device_index_ == event.device_index_, + "Event device index ", + static_cast(device_index_), + " does not match other's device index ", + static_cast(event.device_index_), + "."); + c10::cuda::CUDAGuard guard(device_index_); + float time_ms = 0.0f; + C10_CUDA_CHECK( + cudaEventElapsedTime(&time_ms, cuda_event_, event.cuda_event_)); + return static_cast(time_ms); + } +#endif + TORCH_CHECK(false, "Backend doesn't support event elapsedTime."); return 0.0; } - void *eventId() const { + void* eventId() const { #ifdef PADDLE_WITH_CUDA return cuda_event_; #else @@ -190,11 +188,16 @@ struct Event final { } void synchronize() const { + if (!was_marked_for_recording_) { + return; + } #ifdef PADDLE_WITH_CUDA if (device_type_ == DeviceType::CUDA && cuda_event_) { C10_CUDA_CHECK(cudaEventSynchronize(cuda_event_)); + return; } #endif + TORCH_CHECK(false, "Backend doesn't support events."); } #ifdef PADDLE_WITH_CUDA @@ -208,7 +211,67 @@ struct Event final { bool was_marked_for_recording_ = false; #ifdef PADDLE_WITH_CUDA cudaEvent_t cuda_event_ = nullptr; + + static unsigned int CudaEventCreateFlags(EventFlag flag) { + switch (flag) { + case EventFlag::PYTORCH_DEFAULT: + return cudaEventDisableTiming; + case EventFlag::BACKEND_DEFAULT: + return cudaEventDefault; + default: + TORCH_CHECK(false, "CUDA event received unknown flag"); + } + } + + void EnsureCudaEventCreated(DeviceIndex stream_device_index) { + if (cuda_event_) { + return; + } + c10::cuda::CUDAGuard guard(stream_device_index); + C10_CUDA_CHECK( + cudaEventCreateWithFlags(&cuda_event_, CudaEventCreateFlags(flag_))); + } + + void RecordCudaEvent(cudaStream_t stream, DeviceIndex stream_device_index) { + TORCH_CHECK(device_index_ == -1 || device_index_ == stream_device_index, + "Event device index ", + static_cast(device_index_), + " does not match recording stream's device index ", + static_cast(stream_device_index), + "."); + EnsureCudaEventCreated(stream_device_index); + c10::cuda::CUDAGuard guard(stream_device_index); + C10_CUDA_CHECK(cudaEventRecord(cuda_event_, stream)); + device_index_ = stream_device_index; + was_marked_for_recording_ = true; + } + + void DestroyCudaEvent() noexcept { + if (!cuda_event_) { + return; + } + try { + c10::cuda::CUDAGuard guard(device_index_); + C10_CUDA_CHECK(cudaEventDestroy(cuda_event_)); + } catch (...) { + } + cuda_event_ = nullptr; + } +#else + void DestroyCudaEvent() noexcept {} #endif + + void MoveFrom(Event&& other) noexcept { + device_type_ = other.device_type_; + device_index_ = other.device_index_; + flag_ = other.flag_; + was_marked_for_recording_ = other.was_marked_for_recording_; +#ifdef PADDLE_WITH_CUDA + cuda_event_ = std::exchange(other.cuda_event_, nullptr); +#endif + other.device_index_ = -1; + other.was_marked_for_recording_ = false; + } }; } // namespace c10 diff --git a/paddle/phi/api/include/compat/c10/core/Stream.h b/paddle/phi/api/include/compat/c10/core/Stream.h index e000ca9ac13650..58912130daf303 100644 --- a/paddle/phi/api/include/compat/c10/core/Stream.h +++ b/paddle/phi/api/include/compat/c10/core/Stream.h @@ -106,7 +106,3 @@ struct hash { } }; } // namespace std - -namespace at { -using c10::Stream; -} diff --git a/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h b/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h index a6c1e95b61ef0f..364a634f9abecf 100644 --- a/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h +++ b/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h @@ -134,6 +134,10 @@ class CUDAStream { Device device() const { return Device(DeviceType::CUDA, device_index()); } + // TODO(youge325): Remove after DeepEP paddle branch is updated to use + // stream() + cudaStream_t raw_stream() const { return stream(); } + private: Stream stream_; }; diff --git a/test/cpp/compat/ATen_record_stream_test.cc b/test/cpp/compat/ATen_record_stream_test.cc index db895005714d0e..8be51d243d1022 100644 --- a/test/cpp/compat/ATen_record_stream_test.cc +++ b/test/cpp/compat/ATen_record_stream_test.cc @@ -47,6 +47,14 @@ class RecordStreamTest : public ::testing::Test { // --- Happy path: CUDA tensor + current CUDA stream should succeed --- #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +using RecordCudaStreamMethod = void (at::Tensor::*)(at::cuda::CUDAStream) const; +[[maybe_unused]] static RecordCudaStreamMethod g_record_cuda_stream_method = + &at::Tensor::record_stream; + +using RecordRawCudaStreamMethod = void (at::Tensor::*)(cudaStream_t) const; +[[maybe_unused]] static RecordRawCudaStreamMethod + g_record_raw_cuda_stream_method = &at::Tensor::record_stream; + TEST_F(RecordStreamTest, CudaTensorCurrentCudaStream) { SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); auto stream = at::cuda::getCurrentCUDAStream(); @@ -60,6 +68,12 @@ TEST_F(RecordStreamTest, CudaTensorDefaultCudaStream) { c10::Stream default_stream = c10::cuda::getDefaultCUDAStream().unwrap(); EXPECT_NO_THROW(cuda_tensor.record_stream(default_stream)); } + +TEST_F(RecordStreamTest, CudaTensorRawCudaStream) { + SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); + auto stream = at::cuda::getCurrentCUDAStream(); + EXPECT_NO_THROW(cuda_tensor.record_stream(stream.raw_stream())); +} #endif // PADDLE_WITH_CUDA || PADDLE_WITH_HIP // --- Error path: CPU tensor + CPU stream (record_stream does not support CPU diff --git a/test/cpp/compat/CMakeLists.txt b/test/cpp/compat/CMakeLists.txt index 1e02c599f031f6..e771d03fb431ef 100644 --- a/test/cpp/compat/CMakeLists.txt +++ b/test/cpp/compat/CMakeLists.txt @@ -22,6 +22,7 @@ if(NOT WIN32) nv_test(c10_TypeMeta_test SRCS c10_TypeMeta_test.cc) nv_test(c10_storage_test SRCS c10_storage_test.cc) nv_test(c10_Stream_test SRCS c10_Stream_test.cc) + nv_test(c10_Event_test SRCS c10_Event_test.cc) nv_test(c10_SizesAndStrides_test SRCS c10_SizesAndStrides_test.cc) nv_test(c10_layout_test SRCS c10_layout_test.cc) nv_test(ATen_clamp_test SRCS ATen_clamp_test.cc) diff --git a/test/cpp/compat/c10_Event_test.cc b/test/cpp/compat/c10_Event_test.cc new file mode 100644 index 00000000000000..9b350a96280dbd --- /dev/null +++ b/test/cpp/compat/c10_Event_test.cc @@ -0,0 +1,118 @@ +// Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#include +#include +#endif + +#include "gtest/gtest.h" +#include "test/cpp/compat/cuda_test_utils.h" + +TEST(EventTest, CpuEventDefaultProperties) { + c10::Event event(c10::DeviceType::CPU); + EXPECT_EQ(event.device_type(), c10::DeviceType::CPU); + EXPECT_EQ(event.device_index(), -1); + EXPECT_EQ(event.flag(), c10::EventFlag::PYTORCH_DEFAULT); + EXPECT_FALSE(event.was_marked_for_recording()); + EXPECT_TRUE(event.query()); + EXPECT_EQ(event.eventId(), nullptr); +} + +TEST(EventTest, CpuEventRecordThrows) { + c10::Event event(c10::DeviceType::CPU); + c10::Stream stream(c10::Stream::DEFAULT, + c10::Device(c10::DeviceType::CPU, 0)); + EXPECT_THROW(event.record(stream), std::exception); + EXPECT_THROW(event.recordOnce(stream), std::exception); +} + +#ifdef PADDLE_WITH_CUDA +using RawEventRecordMethod = void (c10::Event::*)(const cudaStream_t&); +[[maybe_unused]] static RawEventRecordMethod g_raw_event_record_method = + &c10::Event::record; +#endif + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +TEST(EventTest, CudaEventLazyCreateAndRecord) { + SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); + c10::Event event(c10::DeviceType::CUDA); + auto stream = c10::cuda::getCurrentCUDAStream(); + + EXPECT_EQ(event.device_index(), -1); + EXPECT_EQ(event.eventId(), nullptr); + EXPECT_FALSE(event.was_marked_for_recording()); + + EXPECT_NO_THROW(event.record(stream.unwrap())); + EXPECT_EQ(event.device_index(), stream.device_index()); + EXPECT_NE(event.eventId(), nullptr); + EXPECT_TRUE(event.was_marked_for_recording()); + + EXPECT_NO_THROW(event.synchronize()); + EXPECT_TRUE(event.query()); +} + +TEST(EventTest, CudaEventElapsedTimeRequiresTimingFlag) { + SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); + auto stream = c10::cuda::getCurrentCUDAStream(); + c10::Event start(c10::DeviceType::CUDA); + c10::Event end(c10::DeviceType::CUDA); + + start.record(stream.unwrap()); + end.record(stream.unwrap()); + end.synchronize(); + + EXPECT_THROW(start.elapsedTime(end), std::exception); +} + +TEST(EventTest, CudaEventElapsedTimeWithTimingEnabled) { + SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); + auto stream = c10::cuda::getCurrentCUDAStream(); + c10::Event start(c10::DeviceType::CUDA, c10::EventFlag::BACKEND_DEFAULT); + c10::Event end(c10::DeviceType::CUDA, c10::EventFlag::BACKEND_DEFAULT); + + start.record(stream.unwrap()); + end.record(stream.unwrap()); + end.synchronize(); + + double elapsed_ms = -1.0; + EXPECT_NO_THROW(elapsed_ms = start.elapsedTime(end)); + EXPECT_GE(elapsed_ms, 0.0); +} + +TEST(EventTest, CudaEventRawStreamRecordCompatibility) { + SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); + auto stream = c10::cuda::getCurrentCUDAStream(); + c10::Event event(c10::DeviceType::CUDA); + EXPECT_NO_THROW(event.record(stream.raw_stream())); + EXPECT_EQ(event.device_index(), stream.device_index()); + EXPECT_TRUE(event.was_marked_for_recording()); +} + +TEST(EventTest, CudaEventRejectsDifferentDeviceRecord) { + SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); + if (c10::cuda::device_count() < 2) { + return; + } + + c10::Event event(c10::DeviceType::CUDA, c10::EventFlag::BACKEND_DEFAULT); + auto stream0 = c10::cuda::getDefaultCUDAStream(0); + auto stream1 = c10::cuda::getDefaultCUDAStream(1); + + EXPECT_NO_THROW(event.record(stream0.unwrap())); + EXPECT_THROW(event.record(stream1.unwrap()), std::exception); +} +#endif // PADDLE_WITH_CUDA || PADDLE_WITH_HIP From 852d525ac182537aa60ea9b098f1edacb49f51d6 Mon Sep 17 00:00:00 2001 From: youge325 Date: Thu, 2 Apr 2026 16:28:06 +0800 Subject: [PATCH 8/8] complement macro --- .../phi/api/include/compat/c10/core/Event.h | 145 +++++++++++++----- test/cpp/compat/c10_Event_test.cc | 16 +- 2 files changed, 114 insertions(+), 47 deletions(-) diff --git a/paddle/phi/api/include/compat/c10/core/Event.h b/paddle/phi/api/include/compat/c10/core/Event.h index 00919c7f13e300..672fb7f496f907 100644 --- a/paddle/phi/api/include/compat/c10/core/Event.h +++ b/paddle/phi/api/include/compat/c10/core/Event.h @@ -20,7 +20,7 @@ #include -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include #include #endif @@ -31,6 +31,16 @@ enum class EventFlag { PYTORCH_DEFAULT, BACKEND_DEFAULT, INVALID }; struct Event final { public: +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#ifdef PADDLE_WITH_HIP + using BackendEvent = hipEvent_t; + using BackendStream = hipStream_t; +#else + using BackendEvent = cudaEvent_t; + using BackendStream = cudaStream_t; +#endif +#endif + Event() = delete; Event(const DeviceType device_type, const EventFlag flag = EventFlag::PYTORCH_DEFAULT) @@ -42,13 +52,13 @@ struct Event final { Event(Event&& other) noexcept { MoveFrom(std::move(other)); } Event& operator=(Event&& other) noexcept { if (this != &other) { - DestroyCudaEvent(); + DestroyBackendEvent(); MoveFrom(std::move(other)); } return *this; } - ~Event() { DestroyCudaEvent(); } + ~Event() { DestroyBackendEvent(); } Device device() const noexcept { return Device(device_type_, device_index_); } DeviceType device_type() const noexcept { return device_type_; } @@ -71,26 +81,28 @@ struct Event final { " does not match recording stream's device type ", stream.device_type(), "."); -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (device_type_ == DeviceType::CUDA) { - RecordCudaEvent(static_cast(stream.native_handle()), - stream.device_index()); + RecordBackendEvent(static_cast(stream.native_handle()), + stream.device_index()); return; } #endif TORCH_CHECK(false, "Backend doesn't support events."); } -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void record(const c10::cuda::CUDAStream& stream) { record(stream.unwrap()); } +#endif +#ifdef PADDLE_WITH_CUDA // TODO(youge325): Remove after DeepEP paddle branch is updated to use // c10::Stream void record(const cudaStream_t& stream) { TORCH_CHECK( device_type_ == DeviceType::CUDA, "Raw cudaStream_t recording is only supported for CUDA events."); - RecordCudaEvent(stream, phi::backends::gpu::GetCurrentDeviceId()); + RecordBackendEvent(stream, phi::backends::gpu::GetCurrentDeviceId()); } #endif @@ -104,8 +116,8 @@ struct Event final { " does not match blocking stream's device type ", stream.device_type(), "."); -#ifdef PADDLE_WITH_CUDA - if (device_type_ == DeviceType::CUDA && cuda_event_) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (device_type_ == DeviceType::CUDA && backend_event_) { TORCH_CHECK(device_index_ == stream.device_index(), "Event device index ", static_cast(device_index_), @@ -113,8 +125,17 @@ struct Event final { static_cast(stream.device_index()), "."); c10::cuda::CUDAGuard guard(device_index_); +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + hipStreamWaitEvent(static_cast(stream.native_handle()), + backend_event_, + 0)); +#else C10_CUDA_CHECK(cudaStreamWaitEvent( - static_cast(stream.native_handle()), cuda_event_, 0)); + static_cast(stream.native_handle()), + backend_event_, + 0)); +#endif return; } #endif @@ -125,9 +146,20 @@ struct Event final { if (!was_marked_for_recording_) { return true; } -#ifdef PADDLE_WITH_CUDA - if (device_type_ == DeviceType::CUDA && cuda_event_) { - const auto err = cudaEventQuery(cuda_event_); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (device_type_ == DeviceType::CUDA && backend_event_) { +#ifdef PADDLE_WITH_HIP + const auto err = hipEventQuery(backend_event_); + if (err == hipSuccess) { + return true; + } + if (err != hipErrorNotReady) { + PADDLE_ENFORCE_GPU_SUCCESS(err); + } else { + (void)hipGetLastError(); + } +#else + const auto err = cudaEventQuery(backend_event_); if (err == cudaSuccess) { return true; } @@ -136,6 +168,7 @@ struct Event final { } else { (void)cudaGetLastError(); } +#endif return false; } #endif @@ -160,8 +193,9 @@ struct Event final { TORCH_CHECK( query() && event.query(), "Both events must be completed before calculating elapsed time."); -#ifdef PADDLE_WITH_CUDA - if (device_type_ == DeviceType::CUDA && cuda_event_ && event.cuda_event_) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (device_type_ == DeviceType::CUDA && backend_event_ && + event.backend_event_) { TORCH_CHECK(device_index_ == event.device_index_, "Event device index ", static_cast(device_index_), @@ -170,8 +204,13 @@ struct Event final { "."); c10::cuda::CUDAGuard guard(device_index_); float time_ms = 0.0f; +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + hipEventElapsedTime(&time_ms, backend_event_, event.backend_event_)); +#else C10_CUDA_CHECK( - cudaEventElapsedTime(&time_ms, cuda_event_, event.cuda_event_)); + cudaEventElapsedTime(&time_ms, backend_event_, event.backend_event_)); +#endif return static_cast(time_ms); } #endif @@ -180,8 +219,8 @@ struct Event final { } void* eventId() const { -#ifdef PADDLE_WITH_CUDA - return cuda_event_; +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + return backend_event_; #else return nullptr; #endif @@ -191,9 +230,13 @@ struct Event final { if (!was_marked_for_recording_) { return; } -#ifdef PADDLE_WITH_CUDA - if (device_type_ == DeviceType::CUDA && cuda_event_) { - C10_CUDA_CHECK(cudaEventSynchronize(cuda_event_)); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (device_type_ == DeviceType::CUDA && backend_event_) { +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS(hipEventSynchronize(backend_event_)); +#else + C10_CUDA_CHECK(cudaEventSynchronize(backend_event_)); +#endif return; } #endif @@ -201,7 +244,7 @@ struct Event final { } #ifdef PADDLE_WITH_CUDA - cudaEvent_t cuda_event() const { return cuda_event_; } + cudaEvent_t cuda_event() const { return backend_event_; } #endif private: @@ -209,56 +252,78 @@ struct Event final { DeviceIndex device_index_ = -1; EventFlag flag_ = EventFlag::PYTORCH_DEFAULT; bool was_marked_for_recording_ = false; -#ifdef PADDLE_WITH_CUDA - cudaEvent_t cuda_event_ = nullptr; +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + BackendEvent backend_event_ = nullptr; - static unsigned int CudaEventCreateFlags(EventFlag flag) { + static unsigned int BackendEventCreateFlags(EventFlag flag) { switch (flag) { case EventFlag::PYTORCH_DEFAULT: +#ifdef PADDLE_WITH_HIP + return hipEventDisableTiming; +#else return cudaEventDisableTiming; +#endif case EventFlag::BACKEND_DEFAULT: +#ifdef PADDLE_WITH_HIP + return hipEventDefault; +#else return cudaEventDefault; +#endif default: TORCH_CHECK(false, "CUDA event received unknown flag"); } } - void EnsureCudaEventCreated(DeviceIndex stream_device_index) { - if (cuda_event_) { + void EnsureBackendEventCreated(DeviceIndex stream_device_index) { + if (backend_event_) { return; } c10::cuda::CUDAGuard guard(stream_device_index); - C10_CUDA_CHECK( - cudaEventCreateWithFlags(&cuda_event_, CudaEventCreateFlags(flag_))); +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS(hipEventCreateWithFlags( + &backend_event_, BackendEventCreateFlags(flag_))); +#else + C10_CUDA_CHECK(cudaEventCreateWithFlags(&backend_event_, + BackendEventCreateFlags(flag_))); +#endif } - void RecordCudaEvent(cudaStream_t stream, DeviceIndex stream_device_index) { + void RecordBackendEvent(BackendStream stream, + DeviceIndex stream_device_index) { TORCH_CHECK(device_index_ == -1 || device_index_ == stream_device_index, "Event device index ", static_cast(device_index_), " does not match recording stream's device index ", static_cast(stream_device_index), "."); - EnsureCudaEventCreated(stream_device_index); + EnsureBackendEventCreated(stream_device_index); c10::cuda::CUDAGuard guard(stream_device_index); - C10_CUDA_CHECK(cudaEventRecord(cuda_event_, stream)); +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(backend_event_, stream)); +#else + C10_CUDA_CHECK(cudaEventRecord(backend_event_, stream)); +#endif device_index_ = stream_device_index; was_marked_for_recording_ = true; } - void DestroyCudaEvent() noexcept { - if (!cuda_event_) { + void DestroyBackendEvent() noexcept { + if (!backend_event_) { return; } try { c10::cuda::CUDAGuard guard(device_index_); - C10_CUDA_CHECK(cudaEventDestroy(cuda_event_)); +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(backend_event_)); +#else + C10_CUDA_CHECK(cudaEventDestroy(backend_event_)); +#endif } catch (...) { } - cuda_event_ = nullptr; + backend_event_ = nullptr; } #else - void DestroyCudaEvent() noexcept {} + void DestroyBackendEvent() noexcept {} #endif void MoveFrom(Event&& other) noexcept { @@ -266,8 +331,8 @@ struct Event final { device_index_ = other.device_index_; flag_ = other.flag_; was_marked_for_recording_ = other.was_marked_for_recording_; -#ifdef PADDLE_WITH_CUDA - cuda_event_ = std::exchange(other.cuda_event_, nullptr); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + backend_event_ = std::exchange(other.backend_event_, nullptr); #endif other.device_index_ = -1; other.was_marked_for_recording_ = false; diff --git a/test/cpp/compat/c10_Event_test.cc b/test/cpp/compat/c10_Event_test.cc index 9b350a96280dbd..a6933e06efea36 100644 --- a/test/cpp/compat/c10_Event_test.cc +++ b/test/cpp/compat/c10_Event_test.cc @@ -56,7 +56,7 @@ TEST(EventTest, CudaEventLazyCreateAndRecord) { EXPECT_EQ(event.eventId(), nullptr); EXPECT_FALSE(event.was_marked_for_recording()); - EXPECT_NO_THROW(event.record(stream.unwrap())); + EXPECT_NO_THROW(event.record(stream)); EXPECT_EQ(event.device_index(), stream.device_index()); EXPECT_NE(event.eventId(), nullptr); EXPECT_TRUE(event.was_marked_for_recording()); @@ -71,8 +71,8 @@ TEST(EventTest, CudaEventElapsedTimeRequiresTimingFlag) { c10::Event start(c10::DeviceType::CUDA); c10::Event end(c10::DeviceType::CUDA); - start.record(stream.unwrap()); - end.record(stream.unwrap()); + start.record(stream); + end.record(stream); end.synchronize(); EXPECT_THROW(start.elapsedTime(end), std::exception); @@ -84,8 +84,8 @@ TEST(EventTest, CudaEventElapsedTimeWithTimingEnabled) { c10::Event start(c10::DeviceType::CUDA, c10::EventFlag::BACKEND_DEFAULT); c10::Event end(c10::DeviceType::CUDA, c10::EventFlag::BACKEND_DEFAULT); - start.record(stream.unwrap()); - end.record(stream.unwrap()); + start.record(stream); + end.record(stream); end.synchronize(); double elapsed_ms = -1.0; @@ -93,6 +93,7 @@ TEST(EventTest, CudaEventElapsedTimeWithTimingEnabled) { EXPECT_GE(elapsed_ms, 0.0); } +#ifdef PADDLE_WITH_CUDA TEST(EventTest, CudaEventRawStreamRecordCompatibility) { SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); auto stream = c10::cuda::getCurrentCUDAStream(); @@ -101,6 +102,7 @@ TEST(EventTest, CudaEventRawStreamRecordCompatibility) { EXPECT_EQ(event.device_index(), stream.device_index()); EXPECT_TRUE(event.was_marked_for_recording()); } +#endif TEST(EventTest, CudaEventRejectsDifferentDeviceRecord) { SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); @@ -112,7 +114,7 @@ TEST(EventTest, CudaEventRejectsDifferentDeviceRecord) { auto stream0 = c10::cuda::getDefaultCUDAStream(0); auto stream1 = c10::cuda::getDefaultCUDAStream(1); - EXPECT_NO_THROW(event.record(stream0.unwrap())); - EXPECT_THROW(event.record(stream1.unwrap()), std::exception); + EXPECT_NO_THROW(event.record(stream0)); + EXPECT_THROW(event.record(stream1), std::exception); } #endif // PADDLE_WITH_CUDA || PADDLE_WITH_HIP