PaddlePaddle · youge325 · Mar 24, 2026 · Mar 25, 2026 · Mar 25, 2026 · Apr 3, 2026
diff --git a/paddle/phi/api/include/compat/ATen/core/TensorBody.h b/paddle/phi/api/include/compat/ATen/core/TensorBody.h
@@ -689,9 +689,6 @@ class Tensor : public TensorBase {
   void record_stream(at::Stream s) const;
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   void record_stream(at::cuda::CUDAStream s) const;
-  // TODO(youge325): Remove after DeepEP paddle branch is updated to use
-  // at::Stream
-  void record_stream(cudaStream_t s) const;
 #endif
 
   Tensor var(int dim) const { return var(at::IntArrayRef{dim}, true, false); }

diff --git a/paddle/phi/api/include/compat/ATen/ops/record_stream.h b/paddle/phi/api/include/compat/ATen/ops/record_stream.h
@@ -51,20 +51,8 @@ inline void Tensor::record_stream(at::Stream s) const {
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 inline void Tensor::record_stream(at::cuda::CUDAStream s) const {
-  record_stream(static_cast<at::Stream>(s));
-}
-
-// TODO(youge325): Remove after DeepEP paddle branch is updated to use
-// at::Stream
-inline void Tensor::record_stream(cudaStream_t s) const {
-  auto dense_tensor =
-      std::dynamic_pointer_cast<phi::DenseTensor>(tensor_.impl());
-  PD_CHECK(dense_tensor != nullptr,
-           "record_stream only supports DenseTensor, but got a non-dense "
-           "tensor implementation.");
-  PD_CHECK(dense_tensor->place().GetType() != phi::AllocationType::CPU,
-           "record_stream is not supported for CPU tensors.");
-  paddle::memory::RecordStream(dense_tensor->Holder(), s);
+  record_stream(s.unwrap());
 }
 #endif
+
 }  // namespace at
diff --git a/paddle/phi/api/include/compat/c10/core/Event.h b/paddle/phi/api/include/compat/c10/core/Event.h
@@ -95,17 +95,6 @@ struct Event final {
   void record(const c10::cuda::CUDAStream& stream) { record(stream.unwrap()); }
 #endif
 
-#ifdef PADDLE_WITH_CUDA
-  // TODO(youge325): Remove after DeepEP paddle branch is updated to use
-  // c10::Stream
-  void record(const cudaStream_t& stream) {
-    TORCH_CHECK(
-        device_type_ == DeviceType::CUDA,
-        "Raw cudaStream_t recording is only supported for CUDA events.");
-    RecordBackendEvent(stream, phi::backends::gpu::GetCurrentDeviceId());
-  }
-#endif
-
   void block(const Stream& stream) const {
     if (!was_marked_for_recording_) {
       return;

diff --git a/paddle/phi/api/include/compat/c10/core/Stream.h b/paddle/phi/api/include/compat/c10/core/Stream.h
@@ -105,3 +105,7 @@ struct hash<c10::Stream> {
   }
 };
 }  // namespace std
+
+namespace at {
+using c10::Stream;  // lets callers spell c10::Stream as at::Stream
+}  // namespace at
-
-namespace at {
-using c10::Stream;
-}
-
-namespace at {
-using c10::Stream;
-}
diff --git a/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h b/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h
@@ -150,8 +150,6 @@ class CUDAStream {
 
   Device device() const { return Device(DeviceType::CUDA, device_index()); }
 
-  cudaStream_t raw_stream() const { return stream(); }
-
   struct c10::StreamData3 pack3() const {
     return stream_.pack3();
   }
@@ -219,7 +217,6 @@ inline CUDAStream getStreamFromPool(const int priority,
   });
 
   cudaStream_t raw;
-
   // Keep parity with PyTorch API shape: negative priority selects the
   // high-priority pool, non-negative selects the low-priority pool.
   if (priority < 0) {

diff --git a/test/cpp/compat/ATen_record_stream_test.cc b/test/cpp/compat/ATen_record_stream_test.cc
@@ -51,10 +51,6 @@ using RecordCudaStreamMethod = void (at::Tensor::*)(at::cuda::CUDAStream) const;
 [[maybe_unused]] static RecordCudaStreamMethod g_record_cuda_stream_method =
     &at::Tensor::record_stream;
 
-using RecordRawCudaStreamMethod = void (at::Tensor::*)(cudaStream_t) const;
-[[maybe_unused]] static RecordRawCudaStreamMethod
-    g_record_raw_cuda_stream_method = &at::Tensor::record_stream;
-
 TEST_F(RecordStreamTest, CudaTensorCurrentCudaStream) {
   SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
   auto stream = at::cuda::getCurrentCUDAStream();
@@ -69,11 +65,6 @@ TEST_F(RecordStreamTest, CudaTensorDefaultCudaStream) {
   EXPECT_NO_THROW(cuda_tensor.record_stream(default_stream));
 }
 
-TEST_F(RecordStreamTest, CudaTensorRawCudaStream) {
-  SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
-  auto stream = at::cuda::getCurrentCUDAStream();
-  EXPECT_NO_THROW(cuda_tensor.record_stream(stream.raw_stream()));
-}
 #endif  // PADDLE_WITH_CUDA || PADDLE_WITH_HIP
 
 // --- Error path: CPU tensor + CPU stream (record_stream does not support CPU

diff --git a/test/cpp/compat/c10_Event_test.cc b/test/cpp/compat/c10_Event_test.cc
@@ -40,12 +40,6 @@ TEST(EventTest, CpuEventRecordThrows) {
   EXPECT_THROW(event.recordOnce(stream), std::exception);
 }
 
-#ifdef PADDLE_WITH_CUDA
-using RawEventRecordMethod = void (c10::Event::*)(const cudaStream_t&);
-[[maybe_unused]] static RawEventRecordMethod g_raw_event_record_method =
-    &c10::Event::record;
-#endif
-
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 TEST(EventTest, CudaEventLazyCreateAndRecord) {
   SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
@@ -93,17 +87,6 @@ TEST(EventTest, CudaEventElapsedTimeWithTimingEnabled) {
   EXPECT_GE(elapsed_ms, 0.0);
 }
 
-#ifdef PADDLE_WITH_CUDA
-TEST(EventTest, CudaEventRawStreamRecordCompatibility) {
-  SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
-  auto stream = c10::cuda::getCurrentCUDAStream();
-  c10::Event event(c10::DeviceType::CUDA);
-  EXPECT_NO_THROW(event.record(stream.raw_stream()));
-  EXPECT_EQ(event.device_index(), stream.device_index());
-  EXPECT_TRUE(event.was_marked_for_recording());
-}
-#endif
-
 TEST(EventTest, CudaEventRejectsDifferentDeviceRecord) {
   SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
   if (c10::cuda::device_count() < 2) {