diff --git a/paddle/phi/api/include/compat/c10/core/Event.h b/paddle/phi/api/include/compat/c10/core/Event.h index 5d2c2d10b710d5..672fb7f496f907 100644 --- a/paddle/phi/api/include/compat/c10/core/Event.h +++ b/paddle/phi/api/include/compat/c10/core/Event.h @@ -16,141 +16,327 @@ #include #include +#include + +#include + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#include #include -#include +#endif + namespace c10 { -/** - * A backend-generic movable, not copyable, not thread-safe event. - * - * The design of this event follows that of CUDA and HIP events. These events - * are recorded and waited on by streams and can be rerecorded to, - * each rerecording essentially creating a new version of the event. - * For example, if (in CPU time), stream X is asked to record E, - * stream Y waits on E, and stream X is asked to record E again, then Y will - * wait for X to finish the first call to record and not the second, because - * it's waiting on the first version of event E, not the second. - * Querying an event only returns the status of its most recent version. - * - * Backend-generic events are implemented by this class and - * impl::InlineEvent. In addition to these events there are also - * some backend-specific events, like ATen's CUDAEvent. Each of these - * classes has its own use. - * - * impl::InlineEvent<...> or a backend-specific event should be - * preferred when the backend is known at compile time and known to - * be compiled. Backend-specific events may have additional functionality. - * - * This Event should be used if a particular backend may not be available, - * or the backend required is not known at compile time. - * - * These generic events are built on top of DeviceGuardImpls, analogous - * to DeviceGuard and InlineDeviceGuard. The name "DeviceGuardImpls," - * is no longer entirely accurate, as these classes implement the - * backend-specific logic for a generic backend interface. - * - * See DeviceGuardImplInterface.h for a list of all supported flags. - */ +enum class EventFlag { PYTORCH_DEFAULT, BACKEND_DEFAULT, INVALID }; -#ifdef PADDLE_WITH_CUDA - -class EventPool { +struct Event final { public: - EventPool(); - EventPool(const EventPool &) = delete; - EventPool(EventPool &&) = delete; - ~EventPool(); - - cudaEvent_t CreateCudaEventFromPool(); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#ifdef PADDLE_WITH_HIP + using BackendEvent = hipEvent_t; + using BackendStream = hipStream_t; +#else + using BackendEvent = cudaEvent_t; + using BackendStream = cudaStream_t; +#endif +#endif - static EventPool &Instance(); + Event() = delete; + Event(const DeviceType device_type, + const EventFlag flag = EventFlag::PYTORCH_DEFAULT) + : device_type_(device_type), flag_(flag) {} - private: - std::queue incomplished_events_; - std::mutex mtx_; -}; + Event(const Event&) = delete; + Event& operator=(const Event&) = delete; -EventPool &EventPool::Instance() { - static EventPool pool; - return pool; -} + Event(Event&& other) noexcept { MoveFrom(std::move(other)); } + Event& operator=(Event&& other) noexcept { + if (this != &other) { + DestroyBackendEvent(); + MoveFrom(std::move(other)); + } + return *this; + } -EventPool::EventPool() { - for (size_t i = 0; i < 1000; ++i) { - cudaEvent_t new_event; - C10_CUDA_CHECK(cudaEventCreate(&new_event)); + ~Event() { DestroyBackendEvent(); } - cudaEventRecord(new_event, 0); - incomplished_events_.push(new_event); + Device device() const noexcept { return Device(device_type_, device_index_); } + DeviceType device_type() const noexcept { return device_type_; } + DeviceIndex device_index() const noexcept { return device_index_; } + EventFlag flag() const noexcept { return flag_; } + bool was_marked_for_recording() const noexcept { + return was_marked_for_recording_; } -} - -EventPool::~EventPool() { - std::unique_lock lock(mtx_); - while (!incomplished_events_.empty()) { - cudaEvent_t event = incomplished_events_.front(); - incomplished_events_.pop(); - if (cudaEventQuery(event) == cudaSuccess) { - C10_CUDA_CHECK(cudaEventDestroy(event)); + + void recordOnce(const Stream& stream) { + if (!was_marked_for_recording_) { + record(stream); } } -} - -cudaEvent_t EventPool::CreateCudaEventFromPool() { - std::unique_lock lock(mtx_); - - const auto &CreateNewEvent = [&]() -> cudaEvent_t { - cudaEvent_t new_event; - C10_CUDA_CHECK(cudaEventCreate(&new_event)); - incomplished_events_.push(new_event); - return new_event; - }; - - const auto &CreateNewOrReuseEvent = [&]() -> cudaEvent_t { - cudaEvent_t front_event = incomplished_events_.front(); - incomplished_events_.pop(); - incomplished_events_.push(front_event); - if (cudaEventQuery(front_event) == cudaSuccess) { - return front_event; + + void record(const Stream& stream) { + TORCH_CHECK(stream.device_type() == device_type_, + "Event device type ", + device_type_, + " does not match recording stream's device type ", + stream.device_type(), + "."); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (device_type_ == DeviceType::CUDA) { + RecordBackendEvent(static_cast(stream.native_handle()), + stream.device_index()); + return; } - return CreateNewEvent(); - }; +#endif + TORCH_CHECK(false, "Backend doesn't support events."); + } - if (incomplished_events_.empty()) { - return CreateNewEvent(); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + void record(const c10::cuda::CUDAStream& stream) { record(stream.unwrap()); } +#endif + +#ifdef PADDLE_WITH_CUDA + // TODO(youge325): Remove after DeepEP paddle branch is updated to use + // c10::Stream + void record(const cudaStream_t& stream) { + TORCH_CHECK( + device_type_ == DeviceType::CUDA, + "Raw cudaStream_t recording is only supported for CUDA events."); + RecordBackendEvent(stream, phi::backends::gpu::GetCurrentDeviceId()); } - return CreateNewOrReuseEvent(); -} +#endif -struct Event final { - public: - Event(const DeviceType &type) { - // device_type is useless, only for compatibility - cuda_event_ = EventPool::Instance().CreateCudaEventFromPool(); + void block(const Stream& stream) const { + if (!was_marked_for_recording_) { + return; + } + TORCH_CHECK(stream.device_type() == device_type_, + "Event device type ", + device_type_, + " does not match blocking stream's device type ", + stream.device_type(), + "."); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (device_type_ == DeviceType::CUDA && backend_event_) { + TORCH_CHECK(device_index_ == stream.device_index(), + "Event device index ", + static_cast(device_index_), + " does not match blocking stream's device index ", + static_cast(stream.device_index()), + "."); + c10::cuda::CUDAGuard guard(device_index_); +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + hipStreamWaitEvent(static_cast(stream.native_handle()), + backend_event_, + 0)); +#else + C10_CUDA_CHECK(cudaStreamWaitEvent( + static_cast(stream.native_handle()), + backend_event_, + 0)); +#endif + return; + } +#endif + TORCH_CHECK(false, "Backend doesn't support events."); } - void record(const Stream &stream) { - C10_CUDA_CHECK(cudaEventRecord( - cuda_event_, static_cast(stream.native_handle()))); + bool query() const { + if (!was_marked_for_recording_) { + return true; + } +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (device_type_ == DeviceType::CUDA && backend_event_) { +#ifdef PADDLE_WITH_HIP + const auto err = hipEventQuery(backend_event_); + if (err == hipSuccess) { + return true; + } + if (err != hipErrorNotReady) { + PADDLE_ENFORCE_GPU_SUCCESS(err); + } else { + (void)hipGetLastError(); + } +#else + const auto err = cudaEventQuery(backend_event_); + if (err == cudaSuccess) { + return true; + } + if (err != cudaErrorNotReady) { + C10_CUDA_CHECK(err); + } else { + (void)cudaGetLastError(); + } +#endif + return false; + } +#endif + TORCH_CHECK(false, "Backend doesn't support events."); + return true; } - void record(const c10::cuda::CUDAStream &stream) { record(stream.unwrap()); } + double elapsedTime(const Event& event) const { + TORCH_CHECK(event.device_type() == device_type_, + "Event device type ", + device_type_, + " does not match other's device type ", + event.device_type(), + "."); + TORCH_CHECK( + flag_ == EventFlag::BACKEND_DEFAULT && + event.flag_ == EventFlag::BACKEND_DEFAULT, + "Both events must be created with argument 'enable_timing=True'."); + TORCH_CHECK( + was_marked_for_recording_ && event.was_marked_for_recording_, + "Both events must be recorded before calculating elapsed time."); + TORCH_CHECK( + query() && event.query(), + "Both events must be completed before calculating elapsed time."); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (device_type_ == DeviceType::CUDA && backend_event_ && + event.backend_event_) { + TORCH_CHECK(device_index_ == event.device_index_, + "Event device index ", + static_cast(device_index_), + " does not match other's device index ", + static_cast(event.device_index_), + "."); + c10::cuda::CUDAGuard guard(device_index_); + float time_ms = 0.0f; +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + hipEventElapsedTime(&time_ms, backend_event_, event.backend_event_)); +#else + C10_CUDA_CHECK( + cudaEventElapsedTime(&time_ms, backend_event_, event.backend_event_)); +#endif + return static_cast(time_ms); + } +#endif + TORCH_CHECK(false, "Backend doesn't support event elapsedTime."); + return 0.0; + } - // TODO(youge325): Remove after DeepEP paddle branch is updated to use - // c10::Stream - void record(const cudaStream_t &stream) { - C10_CUDA_CHECK(cudaEventRecord(cuda_event_, stream)); + void* eventId() const { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + return backend_event_; +#else + return nullptr; +#endif } - void block(const Stream &stream) const { - C10_CUDA_CHECK(cudaStreamWaitEvent( - static_cast(stream.native_handle()), cuda_event_, 0)); + void synchronize() const { + if (!was_marked_for_recording_) { + return; + } +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (device_type_ == DeviceType::CUDA && backend_event_) { +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS(hipEventSynchronize(backend_event_)); +#else + C10_CUDA_CHECK(cudaEventSynchronize(backend_event_)); +#endif + return; + } +#endif + TORCH_CHECK(false, "Backend doesn't support events."); } - cudaEvent_t cuda_event() const { return cuda_event_; } +#ifdef PADDLE_WITH_CUDA + cudaEvent_t cuda_event() const { return backend_event_; } +#endif private: - cudaEvent_t cuda_event_; + DeviceType device_type_; + DeviceIndex device_index_ = -1; + EventFlag flag_ = EventFlag::PYTORCH_DEFAULT; + bool was_marked_for_recording_ = false; +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + BackendEvent backend_event_ = nullptr; + + static unsigned int BackendEventCreateFlags(EventFlag flag) { + switch (flag) { + case EventFlag::PYTORCH_DEFAULT: +#ifdef PADDLE_WITH_HIP + return hipEventDisableTiming; +#else + return cudaEventDisableTiming; +#endif + case EventFlag::BACKEND_DEFAULT: +#ifdef PADDLE_WITH_HIP + return hipEventDefault; +#else + return cudaEventDefault; +#endif + default: + TORCH_CHECK(false, "CUDA event received unknown flag"); + } + } + + void EnsureBackendEventCreated(DeviceIndex stream_device_index) { + if (backend_event_) { + return; + } + c10::cuda::CUDAGuard guard(stream_device_index); +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS(hipEventCreateWithFlags( + &backend_event_, BackendEventCreateFlags(flag_))); +#else + C10_CUDA_CHECK(cudaEventCreateWithFlags(&backend_event_, + BackendEventCreateFlags(flag_))); +#endif + } + + void RecordBackendEvent(BackendStream stream, + DeviceIndex stream_device_index) { + TORCH_CHECK(device_index_ == -1 || device_index_ == stream_device_index, + "Event device index ", + static_cast(device_index_), + " does not match recording stream's device index ", + static_cast(stream_device_index), + "."); + EnsureBackendEventCreated(stream_device_index); + c10::cuda::CUDAGuard guard(stream_device_index); +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(backend_event_, stream)); +#else + C10_CUDA_CHECK(cudaEventRecord(backend_event_, stream)); +#endif + device_index_ = stream_device_index; + was_marked_for_recording_ = true; + } + + void DestroyBackendEvent() noexcept { + if (!backend_event_) { + return; + } + try { + c10::cuda::CUDAGuard guard(device_index_); +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(backend_event_)); +#else + C10_CUDA_CHECK(cudaEventDestroy(backend_event_)); +#endif + } catch (...) { + } + backend_event_ = nullptr; + } +#else + void DestroyBackendEvent() noexcept {} +#endif + + void MoveFrom(Event&& other) noexcept { + device_type_ = other.device_type_; + device_index_ = other.device_index_; + flag_ = other.flag_; + was_marked_for_recording_ = other.was_marked_for_recording_; +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + backend_event_ = std::exchange(other.backend_event_, nullptr); +#endif + other.device_index_ = -1; + other.was_marked_for_recording_ = false; + } }; } // namespace c10 @@ -158,5 +344,3 @@ struct Event final { namespace torch { using c10::Event; } // namespace torch - -#endif diff --git a/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h b/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h index f88dd043317252..364a634f9abecf 100644 --- a/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h +++ b/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h @@ -74,7 +74,7 @@ inline StreamPoolState& get_pool(int device_index) { inline void init_pool(int device_index, StreamPoolState* state) { phi::backends::gpu::GPUDeviceGuard guard(device_index); int lo_pri = 0, hi_pri = 0; - cudaDeviceGetStreamPriorityRange(&lo_pri, &hi_pri); + C10_CUDA_CHECK(cudaDeviceGetStreamPriorityRange(&lo_pri, &hi_pri)); for (int i = 0; i < kStreamsPerPool; ++i) { C10_CUDA_CHECK(cudaStreamCreateWithPriority( &state->low_priority[i], cudaStreamNonBlocking, lo_pri)); @@ -192,6 +192,9 @@ inline CUDAStream getCurrentCUDAStream(c10::DeviceIndex device_index = -1) { * record_stream lifetime semantics. */ inline CUDAStream getStreamFromPool(const bool isHighPriority = false, + c10::DeviceIndex device_index = -1); + +inline CUDAStream getStreamFromPool(const int priority, c10::DeviceIndex device_index = -1) { if (device_index == -1) { device_index = phi::backends::gpu::GetCurrentDeviceId(); @@ -203,7 +206,9 @@ inline CUDAStream getStreamFromPool(const bool isHighPriority = false, }); cudaStream_t raw; - if (isHighPriority) { + // Keep parity with PyTorch API shape: negative priority selects the + // high-priority pool, non-negative selects the low-priority pool. + if (priority < 0) { raw = state.high_priority[state.hp_counter.fetch_add(1) % detail::kStreamsPerPool]; } else { @@ -213,6 +218,11 @@ inline CUDAStream getStreamFromPool(const bool isHighPriority = false, return make_cuda_stream(raw, device_index); } +inline CUDAStream getStreamFromPool(const bool isHighPriority, + c10::DeviceIndex device_index) { + return getStreamFromPool(isHighPriority ? -1 : 0, device_index); +} + /** * Set the current CUDA stream for the device of the given stream in the * calling thread. diff --git a/test/cpp/compat/ATen_record_stream_test.cc b/test/cpp/compat/ATen_record_stream_test.cc index db895005714d0e..8be51d243d1022 100644 --- a/test/cpp/compat/ATen_record_stream_test.cc +++ b/test/cpp/compat/ATen_record_stream_test.cc @@ -47,6 +47,14 @@ class RecordStreamTest : public ::testing::Test { // --- Happy path: CUDA tensor + current CUDA stream should succeed --- #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +using RecordCudaStreamMethod = void (at::Tensor::*)(at::cuda::CUDAStream) const; +[[maybe_unused]] static RecordCudaStreamMethod g_record_cuda_stream_method = + &at::Tensor::record_stream; + +using RecordRawCudaStreamMethod = void (at::Tensor::*)(cudaStream_t) const; +[[maybe_unused]] static RecordRawCudaStreamMethod + g_record_raw_cuda_stream_method = &at::Tensor::record_stream; + TEST_F(RecordStreamTest, CudaTensorCurrentCudaStream) { SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); auto stream = at::cuda::getCurrentCUDAStream(); @@ -60,6 +68,12 @@ TEST_F(RecordStreamTest, CudaTensorDefaultCudaStream) { c10::Stream default_stream = c10::cuda::getDefaultCUDAStream().unwrap(); EXPECT_NO_THROW(cuda_tensor.record_stream(default_stream)); } + +TEST_F(RecordStreamTest, CudaTensorRawCudaStream) { + SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); + auto stream = at::cuda::getCurrentCUDAStream(); + EXPECT_NO_THROW(cuda_tensor.record_stream(stream.raw_stream())); +} #endif // PADDLE_WITH_CUDA || PADDLE_WITH_HIP // --- Error path: CPU tensor + CPU stream (record_stream does not support CPU diff --git a/test/cpp/compat/CMakeLists.txt b/test/cpp/compat/CMakeLists.txt index 1e02c599f031f6..e771d03fb431ef 100644 --- a/test/cpp/compat/CMakeLists.txt +++ b/test/cpp/compat/CMakeLists.txt @@ -22,6 +22,7 @@ if(NOT WIN32) nv_test(c10_TypeMeta_test SRCS c10_TypeMeta_test.cc) nv_test(c10_storage_test SRCS c10_storage_test.cc) nv_test(c10_Stream_test SRCS c10_Stream_test.cc) + nv_test(c10_Event_test SRCS c10_Event_test.cc) nv_test(c10_SizesAndStrides_test SRCS c10_SizesAndStrides_test.cc) nv_test(c10_layout_test SRCS c10_layout_test.cc) nv_test(ATen_clamp_test SRCS ATen_clamp_test.cc) diff --git a/test/cpp/compat/c10_Event_test.cc b/test/cpp/compat/c10_Event_test.cc new file mode 100644 index 00000000000000..a6933e06efea36 --- /dev/null +++ b/test/cpp/compat/c10_Event_test.cc @@ -0,0 +1,120 @@ +// Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#include +#include +#endif + +#include "gtest/gtest.h" +#include "test/cpp/compat/cuda_test_utils.h" + +TEST(EventTest, CpuEventDefaultProperties) { + c10::Event event(c10::DeviceType::CPU); + EXPECT_EQ(event.device_type(), c10::DeviceType::CPU); + EXPECT_EQ(event.device_index(), -1); + EXPECT_EQ(event.flag(), c10::EventFlag::PYTORCH_DEFAULT); + EXPECT_FALSE(event.was_marked_for_recording()); + EXPECT_TRUE(event.query()); + EXPECT_EQ(event.eventId(), nullptr); +} + +TEST(EventTest, CpuEventRecordThrows) { + c10::Event event(c10::DeviceType::CPU); + c10::Stream stream(c10::Stream::DEFAULT, + c10::Device(c10::DeviceType::CPU, 0)); + EXPECT_THROW(event.record(stream), std::exception); + EXPECT_THROW(event.recordOnce(stream), std::exception); +} + +#ifdef PADDLE_WITH_CUDA +using RawEventRecordMethod = void (c10::Event::*)(const cudaStream_t&); +[[maybe_unused]] static RawEventRecordMethod g_raw_event_record_method = + &c10::Event::record; +#endif + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +TEST(EventTest, CudaEventLazyCreateAndRecord) { + SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); + c10::Event event(c10::DeviceType::CUDA); + auto stream = c10::cuda::getCurrentCUDAStream(); + + EXPECT_EQ(event.device_index(), -1); + EXPECT_EQ(event.eventId(), nullptr); + EXPECT_FALSE(event.was_marked_for_recording()); + + EXPECT_NO_THROW(event.record(stream)); + EXPECT_EQ(event.device_index(), stream.device_index()); + EXPECT_NE(event.eventId(), nullptr); + EXPECT_TRUE(event.was_marked_for_recording()); + + EXPECT_NO_THROW(event.synchronize()); + EXPECT_TRUE(event.query()); +} + +TEST(EventTest, CudaEventElapsedTimeRequiresTimingFlag) { + SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); + auto stream = c10::cuda::getCurrentCUDAStream(); + c10::Event start(c10::DeviceType::CUDA); + c10::Event end(c10::DeviceType::CUDA); + + start.record(stream); + end.record(stream); + end.synchronize(); + + EXPECT_THROW(start.elapsedTime(end), std::exception); +} + +TEST(EventTest, CudaEventElapsedTimeWithTimingEnabled) { + SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); + auto stream = c10::cuda::getCurrentCUDAStream(); + c10::Event start(c10::DeviceType::CUDA, c10::EventFlag::BACKEND_DEFAULT); + c10::Event end(c10::DeviceType::CUDA, c10::EventFlag::BACKEND_DEFAULT); + + start.record(stream); + end.record(stream); + end.synchronize(); + + double elapsed_ms = -1.0; + EXPECT_NO_THROW(elapsed_ms = start.elapsedTime(end)); + EXPECT_GE(elapsed_ms, 0.0); +} + +#ifdef PADDLE_WITH_CUDA +TEST(EventTest, CudaEventRawStreamRecordCompatibility) { + SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); + auto stream = c10::cuda::getCurrentCUDAStream(); + c10::Event event(c10::DeviceType::CUDA); + EXPECT_NO_THROW(event.record(stream.raw_stream())); + EXPECT_EQ(event.device_index(), stream.device_index()); + EXPECT_TRUE(event.was_marked_for_recording()); +} +#endif + +TEST(EventTest, CudaEventRejectsDifferentDeviceRecord) { + SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); + if (c10::cuda::device_count() < 2) { + return; + } + + c10::Event event(c10::DeviceType::CUDA, c10::EventFlag::BACKEND_DEFAULT); + auto stream0 = c10::cuda::getDefaultCUDAStream(0); + auto stream1 = c10::cuda::getDefaultCUDAStream(1); + + EXPECT_NO_THROW(event.record(stream0)); + EXPECT_THROW(event.record(stream1), std::exception); +} +#endif // PADDLE_WITH_CUDA || PADDLE_WITH_HIP