diff --git a/paddle/phi/api/include/compat/ATen/core/TensorBody.h b/paddle/phi/api/include/compat/ATen/core/TensorBody.h index a12dfba80b5bec..4ec281953f48b1 100644 --- a/paddle/phi/api/include/compat/ATen/core/TensorBody.h +++ b/paddle/phi/api/include/compat/ATen/core/TensorBody.h @@ -689,9 +689,6 @@ class Tensor : public TensorBase { void record_stream(at::Stream s) const; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void record_stream(at::cuda::CUDAStream s) const; - // TODO(youge325): Remove after DeepEP paddle branch is updated to use - // at::Stream - void record_stream(cudaStream_t s) const; #endif Tensor var(int dim) const { return var(at::IntArrayRef{dim}, true, false); } diff --git a/paddle/phi/api/include/compat/ATen/ops/record_stream.h b/paddle/phi/api/include/compat/ATen/ops/record_stream.h index 73cb5dd4b2247c..7bf27b7cc6ee20 100644 --- a/paddle/phi/api/include/compat/ATen/ops/record_stream.h +++ b/paddle/phi/api/include/compat/ATen/ops/record_stream.h @@ -51,20 +51,8 @@ inline void Tensor::record_stream(at::Stream s) const { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) inline void Tensor::record_stream(at::cuda::CUDAStream s) const { - record_stream(static_cast(s)); -} - -// TODO(youge325): Remove after DeepEP paddle branch is updated to use -// at::Stream -inline void Tensor::record_stream(cudaStream_t s) const { - auto dense_tensor = - std::dynamic_pointer_cast(tensor_.impl()); - PD_CHECK(dense_tensor != nullptr, - "record_stream only supports DenseTensor, but got a non-dense " - "tensor implementation."); - PD_CHECK(dense_tensor->place().GetType() != phi::AllocationType::CPU, - "record_stream is not supported for CPU tensors."); - paddle::memory::RecordStream(dense_tensor->Holder(), s); + record_stream(s.unwrap()); } #endif + } // namespace at diff --git a/paddle/phi/api/include/compat/c10/core/Event.h b/paddle/phi/api/include/compat/c10/core/Event.h index 672fb7f496f907..95894501d6a6ce 100644 --- 
a/paddle/phi/api/include/compat/c10/core/Event.h +++ b/paddle/phi/api/include/compat/c10/core/Event.h @@ -95,17 +95,6 @@ struct Event final { void record(const c10::cuda::CUDAStream& stream) { record(stream.unwrap()); } #endif -#ifdef PADDLE_WITH_CUDA - // TODO(youge325): Remove after DeepEP paddle branch is updated to use - // c10::Stream - void record(const cudaStream_t& stream) { - TORCH_CHECK( - device_type_ == DeviceType::CUDA, - "Raw cudaStream_t recording is only supported for CUDA events."); - RecordBackendEvent(stream, phi::backends::gpu::GetCurrentDeviceId()); - } -#endif - void block(const Stream& stream) const { if (!was_marked_for_recording_) { return; diff --git a/paddle/phi/api/include/compat/c10/core/Stream.h b/paddle/phi/api/include/compat/c10/core/Stream.h index e9bcbc939d9215..f68e863eb931dd 100644 --- a/paddle/phi/api/include/compat/c10/core/Stream.h +++ b/paddle/phi/api/include/compat/c10/core/Stream.h @@ -105,3 +105,7 @@ struct hash { } }; } // namespace std + +namespace at { +using c10::Stream; +} // namespace at diff --git a/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h b/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h index 93ad59cfac8c5a..f96a3fc44a3194 100644 --- a/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h +++ b/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h @@ -150,8 +150,6 @@ class CUDAStream { Device device() const { return Device(DeviceType::CUDA, device_index()); } - cudaStream_t raw_stream() const { return stream(); } - struct c10::StreamData3 pack3() const { return stream_.pack3(); } 
diff --git a/test/cpp/compat/ATen_record_stream_test.cc b/test/cpp/compat/ATen_record_stream_test.cc index 8be51d243d1022..1f37cbcebab380 100644 --- a/test/cpp/compat/ATen_record_stream_test.cc +++ b/test/cpp/compat/ATen_record_stream_test.cc @@ -51,10 +51,6 @@ using RecordCudaStreamMethod = void (at::Tensor::*)(at::cuda::CUDAStream) const; [[maybe_unused]] static RecordCudaStreamMethod g_record_cuda_stream_method = &at::Tensor::record_stream; -using RecordRawCudaStreamMethod = void (at::Tensor::*)(cudaStream_t) const; -[[maybe_unused]] static RecordRawCudaStreamMethod - g_record_raw_cuda_stream_method = &at::Tensor::record_stream; - TEST_F(RecordStreamTest, CudaTensorCurrentCudaStream) { SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); auto stream = at::cuda::getCurrentCUDAStream(); @@ -69,11 +65,6 @@ TEST_F(RecordStreamTest, CudaTensorDefaultCudaStream) { EXPECT_NO_THROW(cuda_tensor.record_stream(default_stream)); } -TEST_F(RecordStreamTest, CudaTensorRawCudaStream) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); - auto stream = at::cuda::getCurrentCUDAStream(); - EXPECT_NO_THROW(cuda_tensor.record_stream(stream.raw_stream())); -} #endif // PADDLE_WITH_CUDA || PADDLE_WITH_HIP // --- Error path: CPU tensor + CPU stream (record_stream does not support CPU diff --git a/test/cpp/compat/c10_Event_test.cc b/test/cpp/compat/c10_Event_test.cc index a6933e06efea36..e318ab8e4a235d 100644 --- a/test/cpp/compat/c10_Event_test.cc +++ b/test/cpp/compat/c10_Event_test.cc @@ -40,12 +40,6 @@ TEST(EventTest, CpuEventRecordThrows) { EXPECT_THROW(event.recordOnce(stream), std::exception); } -#ifdef PADDLE_WITH_CUDA -using RawEventRecordMethod = void (c10::Event::*)(const cudaStream_t&); -[[maybe_unused]] static RawEventRecordMethod g_raw_event_record_method = - &c10::Event::record; -#endif - #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) TEST(EventTest, CudaEventLazyCreateAndRecord) { SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); @@ -93,17 +87,6 @@ TEST(EventTest, 
CudaEventElapsedTimeWithTimingEnabled) { EXPECT_GE(elapsed_ms, 0.0); } -#ifdef PADDLE_WITH_CUDA -TEST(EventTest, CudaEventRawStreamRecordCompatibility) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); - auto stream = c10::cuda::getCurrentCUDAStream(); - c10::Event event(c10::DeviceType::CUDA); - EXPECT_NO_THROW(event.record(stream.raw_stream())); - EXPECT_EQ(event.device_index(), stream.device_index()); - EXPECT_TRUE(event.was_marked_for_recording()); -} -#endif - TEST(EventTest, CudaEventRejectsDifferentDeviceRecord) { SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); if (c10::cuda::device_count() < 2) {