From 2e986b3497458b6531d948a80fde39cade86ea09 Mon Sep 17 00:00:00 2001 From: youge325 Date: Sat, 28 Mar 2026 16:51:22 +0800 Subject: [PATCH 1/8] fix --- test/cpp/compat/ATen_TensorAccessor_test.cc | 2 +- test/cpp/compat/c10_storage_test.cc | 9 +++++---- test/cpp/compat/compat_basic_test.cc | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/test/cpp/compat/ATen_TensorAccessor_test.cc b/test/cpp/compat/ATen_TensorAccessor_test.cc index cb1eaaf3c8add0..4d6b8e9648bcfa 100644 --- a/test/cpp/compat/ATen_TensorAccessor_test.cc +++ b/test/cpp/compat/ATen_TensorAccessor_test.cc @@ -198,7 +198,7 @@ TEST(TensorAccessorTest, PackedAccessorWithIntType) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) TEST(TensorAccessorTest, PackedAccessorCUDA) { - if (at::cuda::is_available()) { + if (torch::cuda::is_available()) { // Create CUDA tensor at::Tensor tensor = at::arange(12, at::TensorOptions().dtype(at::kFloat).device(at::kCUDA)) diff --git a/test/cpp/compat/c10_storage_test.cc b/test/cpp/compat/c10_storage_test.cc index f05083fe88747f..d94dee0d47b605 100644 --- a/test/cpp/compat/c10_storage_test.cc +++ b/test/cpp/compat/c10_storage_test.cc @@ -26,8 +26,9 @@ #include "paddle/phi/backends/gpu/gpu_info.h" // Forward-declare getCUDADeviceAllocator to avoid include-order conflicts -// between ATen/cuda/CUDAContextLight.h (defines at::cuda::is_available inline) -// and torch/cuda.h (adds `using torch::cuda::is_available` to at::cuda). +// between ATen/cuda/CUDAContextLight.h (defines torch::cuda::is_available +// inline) and torch/cuda.h (adds `using torch::cuda::is_available` to +// at::cuda). namespace at::cuda { c10::Allocator* getCUDADeviceAllocator(); } // namespace at::cuda @@ -242,7 +243,7 @@ TEST(StorageTest, DeviceAndDeviceTypeAPIs) { ASSERT_EQ(place.GetType(), phi::AllocationType::CPU); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (at::cuda::is_available()) { + if (torch::cuda::is_available()) { at::TensorBase cuda_tensor = at::ones( {2, 3}, c10::TensorOptions().dtype(at::kFloat).device(at::kCUDA)); const c10::Storage& cuda_storage = cuda_tensor.storage(); @@ -1041,7 +1042,7 @@ TEST(StorageTest, ReferenceSemanticsSetNbytesVisibleThroughCopy) { TEST(StorageTest, CUDAAllocatorZeroBytePreservesDevice) { // getCUDADeviceAllocator()->allocate(0) must return a DataPtr whose device // is the current CUDA device, not a default-constructed CPU DataPtr. - if (!at::cuda::is_available()) { + if (!torch::cuda::is_available()) { return; // No CUDA device, skip } diff --git a/test/cpp/compat/compat_basic_test.cc b/test/cpp/compat/compat_basic_test.cc index d6f33869449003..232a9fd66e8f7c 100644 --- a/test/cpp/compat/compat_basic_test.cc +++ b/test/cpp/compat/compat_basic_test.cc @@ -231,7 +231,7 @@ TEST(compat_basic_test, BasicCase) { TEST(TestDevice, DeviceAPIsOnCUDA) { // Test device related APIs on CUDA if available #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (at::cuda::is_available()) { + if (torch::cuda::is_available()) { at::TensorBase cuda_tensor = at::ones( {2, 3}, c10::TensorOptions().dtype(at::kFloat).device(at::kCUDA)); From 177c6dcd3cbce820e4001869c77e96ca2ea4b427 Mon Sep 17 00:00:00 2001 From: youge325 Date: Sat, 28 Mar 2026 20:22:18 +0800 Subject: [PATCH 2/8] align some APIs --- .../phi/api/include/compat/ATen/core/ivalue.h | 47 +++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/paddle/phi/api/include/compat/ATen/core/ivalue.h b/paddle/phi/api/include/compat/ATen/core/ivalue.h index d22e19c4cdc1c1..f1151e2dcf537d 100644 --- a/paddle/phi/api/include/compat/ATen/core/ivalue.h +++ b/paddle/phi/api/include/compat/ATen/core/ivalue.h @@ -224,6 +224,16 @@ class IValue { bool is_custom_class() const { return tag_ == TypeTag::CustomClass; } bool is_tuple() const { return tag_ == TypeTag::Tuple; } + bool isNone() const { return is_none(); } + bool isBool() const { return is_bool(); } + bool isInt() const { return is_int(); } + bool isDouble() const { return is_double(); } + bool isString() const { return is_string(); } + bool isList() const { return is_list(); } + bool isTensor() const { return is_tensor(); } + bool isCustomClass() const { return is_custom_class(); } + bool isTuple() const { return is_tuple(); } + bool to_bool() const { if (!is_bool()) throw std::runtime_error("Not a bool"); return std::get(value_); @@ -280,6 +290,39 @@ class IValue { return static_cast(std::get(value_)); } + bool toBool() const { return to_bool(); } + int64_t toInt() const { return to_int(); } + double toDouble() const { return to_double(); } + const std::string& toStringRef() const { return to_string(); } + std::string_view toStringView() const { return to_string_view(); } + at::Tensor toTensor() const { return to_tensor(); } + at::ScalarType toScalarType() const { return to_scalar_type(); } + + std::string tagKind() const { + switch (tag_) { + case TypeTag::None: + return "None"; + case TypeTag::Bool: + return "Bool"; + case TypeTag::Int: + return "Int"; + case TypeTag::Double: + return "Double"; + case TypeTag::String: + return "String"; + case TypeTag::Tensor: + return "Tensor"; + case TypeTag::GenericList: + return "GenericList"; + case TypeTag::CustomClass: + return "CustomClass"; + case TypeTag::Tuple: + return "Tuple"; + default: + return "InvalidTag"; + } + } + template intrusive_ptr to_custom_class() const { if (!is_custom_class()) throw std::runtime_error("Not a custom class"); @@ -637,3 +680,7 @@ intrusive_ptr generic_to(const IValue& ivalue, } } // namespace torch + +namespace c10 { +using IValue = ::torch::IValue; +} From 84aa0c03174abc836bdc25aa9bdb17f4aa3456c3 Mon Sep 17 00:00:00 2001 From: youge325 Date: Sat, 28 Mar 2026 20:36:20 +0800 Subject: [PATCH 3/8] add align tests --- test/cpp/compat/c10_layout_test.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/cpp/compat/c10_layout_test.cc b/test/cpp/compat/c10_layout_test.cc index 6e163aa8ee4c5a..1a0babeb7d3673 100644 --- a/test/cpp/compat/c10_layout_test.cc +++ b/test/cpp/compat/c10_layout_test.cc @@ -368,6 +368,9 @@ TEST(SparseConstructorTest, SparseCooTensorInferSize) { ASSERT_TRUE(sparse.is_sparse()); ASSERT_EQ(sparse.layout(), c10::kSparse); + ASSERT_EQ(sparse.dim(), 2); + ASSERT_EQ(sparse.size(0), 3); + ASSERT_EQ(sparse.size(1), 3); } TEST(SparseConstructorTest, SparseCooTensorDouble) { From f3d4070b09964a4d5fc9ed192c5b51c879d84227 Mon Sep 17 00:00:00 2001 From: youge325 Date: Sat, 28 Mar 2026 21:21:30 +0800 Subject: [PATCH 4/8] align some APIs --- .../api/include/compat/c10/cuda/CUDAGuard.h | 104 ++++++++++----- .../api/include/compat/c10/cuda/CUDAStream.h | 121 +++++++++++------- 2 files changed, 146 insertions(+), 79 deletions(-) diff --git a/paddle/phi/api/include/compat/c10/cuda/CUDAGuard.h b/paddle/phi/api/include/compat/c10/cuda/CUDAGuard.h index ce819e69e64932..8f2193ef46ae0e 100644 --- a/paddle/phi/api/include/compat/c10/cuda/CUDAGuard.h +++ b/paddle/phi/api/include/compat/c10/cuda/CUDAGuard.h @@ -19,16 +19,43 @@ #pragma once #include +#include + #include + #include "paddle/phi/core/platform/cuda_device_guard.h" namespace c10::cuda { + +namespace detail { + +inline Device current_cuda_device() { + return Device(kCUDA, phi::backends::gpu::GetCurrentDeviceId()); +} + +inline Device normalize_cuda_device(Device device) { + TORCH_CHECK(device.is_cuda(), "Expected a CUDA device, but got ", device); + return Device(kCUDA, device.has_index() ? device.index() : 0); +} + +} // namespace detail + struct CUDAGuard { explicit CUDAGuard() = delete; // NOLINT - explicit CUDAGuard(DeviceIndex device_index) : guard_(device_index) {} + explicit CUDAGuard(DeviceIndex device_index) + : original_device_(detail::current_cuda_device()), + current_device_(original_device_), + guard_() { + set_index(device_index); + } - explicit CUDAGuard(Device device) : guard_(device._PD_GetInner()) {} + explicit CUDAGuard(Device device) + : original_device_(detail::current_cuda_device()), + current_device_(original_device_), + guard_() { + set_device(device); + } CUDAGuard(const CUDAGuard&) = delete; CUDAGuard& operator=(const CUDAGuard&) = delete; @@ -37,79 +64,86 @@ struct CUDAGuard { CUDAGuard& operator=(CUDAGuard&& other) = delete; ~CUDAGuard() = default; - void set_device(Device device) { guard_.SetDevice(device._PD_GetInner()); } + void set_device(Device device) { + current_device_ = detail::normalize_cuda_device(device); + guard_.SetDevice(current_device_._PD_GetInner()); + } void reset_device(Device device) { set_device(device); } void set_index(DeviceIndex device_index) { + current_device_ = Device(kCUDA, device_index); guard_.SetDeviceIndex(device_index); } - Device current_device() const { - return c10::Device(c10::kCUDA, phi::backends::gpu::GetCurrentDeviceId()); - } + Device original_device() const { return original_device_; } + + Device current_device() const { return current_device_; } private: + Device original_device_; + Device current_device_; paddle::platform::CUDADeviceGuard guard_; }; struct OptionalCUDAGuard { OptionalCUDAGuard() = default; - explicit OptionalCUDAGuard(std::optional device_opt) : guard_() { + explicit OptionalCUDAGuard(std::optional device_opt) { if (device_opt.has_value()) { - guard_.emplace(device_opt.value()._PD_GetInner()); + set_device(device_opt.value()); } } - explicit OptionalCUDAGuard(std::optional device_index_opt) - : guard_() { + explicit OptionalCUDAGuard(std::optional device_index_opt) { if (device_index_opt.has_value()) { - guard_.emplace(device_index_opt.value()); + set_index(device_index_opt.value()); } } - // Copy is not allowed OptionalCUDAGuard(const OptionalCUDAGuard&) = delete; OptionalCUDAGuard& operator=(const OptionalCUDAGuard&) = delete; OptionalCUDAGuard(OptionalCUDAGuard&& other) = delete; - OptionalCUDAGuard& operator=(OptionalCUDAGuard&& other) = delete; ~OptionalCUDAGuard() = default; void set_device(Device device) { - if (!guard_.has_value()) { - guard_.emplace(device._PD_GetInner()); - } else { - guard_->SetDevice(device._PD_GetInner()); - } + const Device normalized = detail::normalize_cuda_device(device); + init_if_needed(); + guard_->SetDevice(normalized._PD_GetInner()); + current_device_ = normalized; } - void reset_device(Device device) { - if (!guard_.has_value()) { - guard_.emplace(device._PD_GetInner()); - } else { - guard_->SetDevice(device._PD_GetInner()); - } - } + void reset_device(Device device) { set_device(device); } void set_index(DeviceIndex device_index) { - if (!guard_.has_value()) { - guard_.emplace(device_index); - } else { - guard_->SetDeviceIndex(device_index); - } + init_if_needed(); + guard_->SetDeviceIndex(device_index); + current_device_ = Device(kCUDA, device_index); } - std::optional current_device() const { - return guard_.has_value() - ? std::make_optional(c10::Device( - c10::kCUDA, phi::backends::gpu::GetCurrentDeviceId())) - : std::nullopt; + std::optional original_device() const { return original_device_; } + + std::optional current_device() const { return current_device_; } + + void reset() { + guard_.reset(); + original_device_.reset(); + current_device_.reset(); } private: + void init_if_needed() { + if (!guard_.has_value()) { + original_device_ = detail::current_cuda_device(); + current_device_ = original_device_; + guard_.emplace(); + } + } + + std::optional original_device_; + std::optional current_device_; std::optional guard_; }; diff --git a/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h b/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h index f88dd043317252..f0f8f23ee1b711 100644 --- a/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h +++ b/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h @@ -18,9 +18,14 @@ #include #include +#include #include #include +#include #include +#include +#include + #include "paddle/phi/api/include/context_pool.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/common/place.h" @@ -30,16 +35,13 @@ namespace c10::cuda { using StreamId = int64_t; -// ── Per-device stream pool and per-thread current stream ───────────────────── +static constexpr int max_compile_time_stream_priorities = 4; namespace detail { constexpr int kStreamsPerPool = 32; -// Upper bound for static pool/TLS arrays. 64 covers all current CUDA hardware. constexpr int kMaxDevices = 64; -// Device count is invariant after CUDA initialization; cache it to avoid -// repeated driver calls on the hot path. inline int gpu_device_count() { static const int count = phi::backends::gpu::GetGPUDeviceCount(); return count; @@ -83,9 +85,6 @@ inline void init_pool(int device_index, StreamPoolState* state) { } } -// Per-thread, per-device current stream state. -// thread_local inside an inline function is ODR-safe across translation units -// (C++11 §3.2): all TUs share the same thread-local instance per thread. struct TLSStreamState { cudaStream_t streams[kMaxDevices]{}; bool has_stream[kMaxDevices]{}; @@ -98,22 +97,24 @@ inline TLSStreamState& get_tls() { } // namespace detail -// ── CUDAStream ─────────────────────────────────────────────────────────────── - class CUDAStream { public: + enum Unchecked { UNCHECKED }; + CUDAStream() = delete; explicit CUDAStream(Stream stream) : stream_(stream) { TORCH_CHECK(stream_.device_type() == DeviceType::CUDA); } + explicit CUDAStream(Unchecked /*unused*/, Stream stream) : stream_(stream) {} + bool operator==(const CUDAStream& other) const noexcept { - return stream_ == other.stream_; + return unwrap() == other.unwrap(); } bool operator!=(const CUDAStream& other) const noexcept { - return stream_ != other.stream_; + return unwrap() != other.unwrap(); } StreamId id() const { return stream_.id(); } @@ -122,6 +123,21 @@ class CUDAStream { operator Stream() const { return unwrap(); } + bool query() const { return unwrap().query(); } + + void synchronize() const { unwrap().synchronize(); } + + int priority() const { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + phi::backends::gpu::GPUDeviceGuard guard(device_index()); + int priority = 0; + C10_CUDA_CHECK(cudaStreamGetPriority(stream(), &priority)); + return priority; +#else + return 0; +#endif + } + cudaStream_t stream() const { return reinterpret_cast(stream_.id()); } @@ -134,17 +150,34 @@ class CUDAStream { Device device() const { return Device(DeviceType::CUDA, device_index()); } - // TODO(youge325): Remove after DeepEP paddle branch is updated to use - // stream() - cudaStream_t raw_stream() const { return stream(); } + struct c10::StreamData3 pack3() const { + return stream_.pack3(); + } + + static CUDAStream unpack3(StreamId stream_id, + DeviceIndex device_index, + DeviceType device_type) { + return CUDAStream(Stream::unpack3(stream_id, device_index, device_type)); + } + + static std::tuple priority_range() { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + int least_priority = 0; + int greatest_priority = 0; + C10_CUDA_CHECK( + cudaDeviceGetStreamPriorityRange(&least_priority, &greatest_priority)); + greatest_priority = + std::max(-max_compile_time_stream_priorities + 1, greatest_priority); + return std::make_tuple(least_priority, greatest_priority); +#else + return std::make_tuple(0, 0); +#endif + } private: Stream stream_; }; -// Build a CUDAStream from a raw platform stream handle and a device index. -// The handle is encoded as a StreamId via reinterpret_cast, matching Paddle's -// phi::Stream / phi::CUDAStream convention. inline CUDAStream make_cuda_stream(cudaStream_t raw, c10::DeviceIndex device_index) { c10::StreamId sid = @@ -155,14 +188,6 @@ inline CUDAStream make_cuda_stream(cudaStream_t raw, sid)); } -/** - * Get the current CUDA stream for the given device (or the current device if - * device_index == -1). - * - * Returns the per-thread current stream if one has been set via - * setCurrentCUDAStream() for this thread and device; otherwise falls back to - * Paddle's default stream for the device. - */ inline CUDAStream getCurrentCUDAStream(c10::DeviceIndex device_index = -1) { if (device_index == -1) { device_index = phi::backends::gpu::GetCurrentDeviceId(); @@ -181,29 +206,18 @@ inline CUDAStream getCurrentCUDAStream(c10::DeviceIndex device_index = -1) { return make_cuda_stream(raw, device_index); } -/** - * Get a stream from the per-device pool in round-robin fashion. - * Returns a high priority stream if isHighPriority is true. - * - * The pool is lazily initialized on first use for each device. Each device - * has kStreamsPerPool low-priority and kStreamsPerPool high-priority streams - * that are reused round-robin. Pool streams are always distinct from the - * current stream, enabling cross-stream dependency management and correct - * record_stream lifetime semantics. - */ -inline CUDAStream getStreamFromPool(const bool isHighPriority = false, +inline CUDAStream getStreamFromPool(const int priority, c10::DeviceIndex device_index = -1) { if (device_index == -1) { device_index = phi::backends::gpu::GetCurrentDeviceId(); } - // get_pool also performs bounds-checking on device_index. auto& state = detail::get_pool(device_index); std::call_once(state.init_flag, [device_index, &state]() { detail::init_pool(device_index, &state); }); cudaStream_t raw; - if (isHighPriority) { + if (priority < 0) { raw = state.high_priority[state.hp_counter.fetch_add(1) % detail::kStreamsPerPool]; } else { @@ -221,6 +235,17 @@ inline CUDAStream getStreamFromPool(const bool isHighPriority = false, * local to the calling OS thread and does not affect any shared state such as * Paddle's GPUContext. Other threads continue to see their own current stream. */ +inline CUDAStream getStreamFromPool(const bool isHighPriority, + c10::DeviceIndex device_index) { + return getStreamFromPool(isHighPriority ? -1 : 0, device_index); +} + +inline CUDAStream getStreamFromExternal(cudaStream_t ext_stream, + c10::DeviceIndex device_index) { + detail::check_device_index(device_index); + return make_cuda_stream(ext_stream, device_index); +} + inline void setCurrentCUDAStream(CUDAStream stream) { c10::DeviceIndex idx = stream.unwrap().device_index(); detail::check_device_index(idx); @@ -234,22 +259,30 @@ inline CUDAStream getDefaultCUDAStream(c10::DeviceIndex device_index = -1) { device_index = phi::backends::gpu::GetCurrentDeviceId(); } detail::check_device_index(device_index); - // The default CUDA stream is always the null stream (cudaStreamDefault, - // handle == 0), regardless of any per-thread current stream override. - // This matches PyTorch semantics where getDefaultCUDAStream() returns the - // fixed device-level default stream, while getCurrentCUDAStream() returns - // the per-thread current stream (which may differ after - // setCurrentCUDAStream). return CUDAStream(c10::Stream( c10::Stream::DEFAULT, c10::Device(c10::DeviceType::CUDA, device_index))); } +inline std::ostream& operator<<(std::ostream& stream, const CUDAStream& s) { + return stream << s.unwrap(); +} + } // namespace c10::cuda +namespace std { +template <> +struct hash { + size_t operator()(c10::cuda::CUDAStream s) const noexcept { + return std::hash{}(s.unwrap()); + } +}; +} // namespace std + namespace at::cuda { using c10::cuda::CUDAStream; using c10::cuda::getCurrentCUDAStream; using c10::cuda::getDefaultCUDAStream; +using c10::cuda::getStreamFromExternal; using c10::cuda::getStreamFromPool; using c10::cuda::setCurrentCUDAStream; } // namespace at::cuda From 1d0432550c6508ddd97c50f56bf0144fa6095234 Mon Sep 17 00:00:00 2001 From: youge325 Date: Sun, 29 Mar 2026 12:46:39 +0800 Subject: [PATCH 5/8] align Allocator --- .../api/include/compat/c10/core/Allocator.h | 101 +++++++++++++++++- 1 file changed, 98 insertions(+), 3 deletions(-) diff --git a/paddle/phi/api/include/compat/c10/core/Allocator.h b/paddle/phi/api/include/compat/c10/core/Allocator.h index a35070936031ad..14a9f2f7dd57d1 100644 --- a/paddle/phi/api/include/compat/c10/core/Allocator.h +++ b/paddle/phi/api/include/compat/c10/core/Allocator.h @@ -23,7 +23,9 @@ #include #include +#include #include +#include #include #include #include @@ -37,6 +39,15 @@ namespace c10 { // Deleter function pointer type (compatible with LibTorch) using DeleterFnPtr = void (*)(void*); +using CaptureId_t = uint64_t; +using MempoolId_t = std::pair; + +struct MempoolIdHash { + std::size_t operator()(const MempoolId_t& mempool_id) const noexcept { + return mempool_id.first != 0 ? mempool_id.first : mempool_id.second; + } +}; + // DataPtr class compatible with LibTorch's c10::DataPtr // Wraps a pointer with associated device and deleter class DataPtr { @@ -63,6 +74,7 @@ class DataPtr { void clear() { ptr_.clear(); } void* get() const { return ptr_.get(); } + void* mutable_get() { return ptr_.get(); } void* get_context() const { return ptr_.get_context(); } void* release_context() { return ptr_.release_context(); } @@ -128,15 +140,14 @@ struct Allocator { // Requires: input data was allocated by the same allocator. DataPtr clone(const void* data, std::size_t n) { auto new_data = allocate(n); - copy_data(new_data.get(), data, n); + copy_data(new_data.mutable_get(), data, n); return new_data; } // Checks if DataPtr has a simple context, not wrapped with any out of the // ordinary contexts. virtual bool is_simple_data_ptr(const DataPtr& data_ptr) const { - return data_ptr.get_context() == nullptr || - data_ptr.get_context() == data_ptr.get(); + return data_ptr.get() == data_ptr.get_context(); } // If this returns a non nullptr, it means that allocate() @@ -176,6 +187,90 @@ struct Allocator { } }; +struct InefficientStdFunctionContext { + void* ptr_{nullptr}; + std::function deleter_; + + InefficientStdFunctionContext(void* ptr, std::function deleter) + : ptr_(ptr), deleter_(std::move(deleter)) {} + + InefficientStdFunctionContext(const InefficientStdFunctionContext&) = delete; + + InefficientStdFunctionContext(InefficientStdFunctionContext&& rhs) noexcept + : ptr_(std::exchange(rhs.ptr_, nullptr)), + deleter_(std::move(rhs.deleter_)) {} + + InefficientStdFunctionContext& operator=( + const InefficientStdFunctionContext&) = delete; + + InefficientStdFunctionContext& operator=( + InefficientStdFunctionContext&& rhs) { + this->~InefficientStdFunctionContext(); + ptr_ = std::exchange(rhs.ptr_, nullptr); + deleter_ = std::move(rhs.deleter_); + return *this; + } + + ~InefficientStdFunctionContext() { + if (deleter_) { + deleter_(ptr_); + } + } + + static DataPtr makeDataPtr(void* ptr, + std::function deleter, + Device device) { + return DataPtr(ptr, + new InefficientStdFunctionContext(ptr, std::move(deleter)), + &deleteContext, + device); + } + + private: + static void deleteContext(void* ptr) { + delete static_cast(ptr); + } +}; + +inline constexpr size_t kAllocatorRegistrySize = + static_cast(DeviceType::CUSTOM) + 1; + +inline std::array g_allocator_array{}; +inline std::array g_allocator_priority{}; + +inline size_t allocator_device_index(DeviceType t) { + const size_t index = static_cast(t); + TORCH_CHECK(index < kAllocatorRegistrySize, + "Allocator device type index out of range: ", + index); + return index; +} + +inline void SetAllocator(DeviceType t, Allocator* alloc, uint8_t priority = 0) { + const size_t index = allocator_device_index(t); + if (priority >= g_allocator_priority[index]) { + g_allocator_array[index] = alloc; + g_allocator_priority[index] = priority; + } +} + +inline Allocator* GetAllocator(const DeviceType& t) { + const size_t index = allocator_device_index(t); + auto* alloc = g_allocator_array[index]; + TORCH_CHECK(alloc != nullptr, "Allocator for ", t, " is not set."); + return alloc; +} + +template +struct AllocatorRegisterer { + explicit AllocatorRegisterer(Allocator* alloc) { SetAllocator(t, alloc); } +}; + +#define REGISTER_ALLOCATOR(t, f) \ + namespace { \ + static c10::AllocatorRegisterer g_allocator_d(f); \ + } + } // namespace c10 namespace at { From 1ea7eea57ef70b37ea6f48b9cdc0b316c786b4b5 Mon Sep 17 00:00:00 2001 From: youge325 Date: Sun, 29 Mar 2026 21:33:23 +0800 Subject: [PATCH 6/8] add version.h, fix some cuda functions and storage test --- paddle/phi/api/include/compat/CMakeLists.txt | 1 + .../include/compat/c10/cuda/CUDAFunctions.cpp | 44 +++++++++++++++++++ .../include/compat/c10/cuda/CUDAFunctions.h | 25 +---------- .../torch/csrc/api/include/torch/version.h | 39 ++++++++++++++++ test/cpp/compat/c10_storage_test.cc | 8 +++- 5 files changed, 92 insertions(+), 25 deletions(-) create mode 100644 paddle/phi/api/include/compat/c10/cuda/CUDAFunctions.cpp create mode 100644 paddle/phi/api/include/compat/torch/csrc/api/include/torch/version.h diff --git a/paddle/phi/api/include/compat/CMakeLists.txt b/paddle/phi/api/include/compat/CMakeLists.txt index 3a3c939b44cba7..3f0c635261a0d8 100644 --- a/paddle/phi/api/include/compat/CMakeLists.txt +++ b/paddle/phi/api/include/compat/CMakeLists.txt @@ -1,5 +1,6 @@ collect_srcs(api_srcs SRCS c10/core/Device.cpp) collect_srcs(api_srcs SRCS c10/core/Stream.cpp) +collect_srcs(api_srcs SRCS c10/cuda/CUDAFunctions.cpp) collect_srcs(api_srcs SRCS c10/util/typeid.cpp) collect_srcs(api_srcs SRCS ATen/cuda/EmptyTensor.cpp) collect_srcs(api_srcs SRCS ATen/cuda/CUDAContextLight.cpp) diff --git a/paddle/phi/api/include/compat/c10/cuda/CUDAFunctions.cpp b/paddle/phi/api/include/compat/c10/cuda/CUDAFunctions.cpp new file mode 100644 index 00000000000000..482bf7036d7cea --- /dev/null +++ b/paddle/phi/api/include/compat/c10/cuda/CUDAFunctions.cpp @@ -0,0 +1,44 @@ +// Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +namespace c10::cuda { + +c10::DeviceIndex device_count() { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + return phi::backends::gpu::GetGPUDeviceCount(); +#else + // Return 0 instead of throwing to match PyTorch API semantics + // at::cuda::is_available() relies on this returning 0/false + return 0; +#endif +} + +void device_synchronize() { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + int curr_device_id = paddle::platform::GetCurrentDeviceId(); + paddle::platform::SetDeviceId(curr_device_id); +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); +#else + PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); +#endif +#else + PADDLE_THROW(common::errors::Unavailable( + "Paddle is not compiled with CUDA. Cannot visit device synchronize.")); +#endif +} + +} // namespace c10::cuda diff --git a/paddle/phi/api/include/compat/c10/cuda/CUDAFunctions.h b/paddle/phi/api/include/compat/c10/cuda/CUDAFunctions.h index 086c3f486cc6ae..bcc097cadd033d 100644 --- a/paddle/phi/api/include/compat/c10/cuda/CUDAFunctions.h +++ b/paddle/phi/api/include/compat/c10/cuda/CUDAFunctions.h @@ -18,30 +18,9 @@ namespace c10::cuda { -inline c10::DeviceIndex device_count() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - return phi::backends::gpu::GetGPUDeviceCount(); -#else - // Return 0 instead of throwing to match PyTorch API semantics - // at::cuda::is_available() relies on this returning 0/false - return 0; -#endif -} +c10::DeviceIndex device_count(); -inline void device_synchronize() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - int curr_device_id = paddle::platform::GetCurrentDeviceId(); - paddle::platform::SetDeviceId(curr_device_id); -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); -#else - PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); -#endif -#else - PADDLE_THROW(common::errors::Unavailable( - "Paddle is not compiled with CUDA. Cannot visit device synchronize.")); -#endif -} +void device_synchronize(); void __inline__ stream_synchronize(gpuStream_t stream) { phi::backends::gpu::GpuStreamSync(stream); diff --git a/paddle/phi/api/include/compat/torch/csrc/api/include/torch/version.h b/paddle/phi/api/include/compat/torch/csrc/api/include/torch/version.h new file mode 100644 index 00000000000000..f745837944df17 --- /dev/null +++ b/paddle/phi/api/include/compat/torch/csrc/api/include/torch/version.h @@ -0,0 +1,39 @@ +// Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +/// Indicates the major version of LibTorch. +#define TORCH_VERSION_MAJOR 2 + +/// Indicates the minor version of LibTorch. +#define TORCH_VERSION_MINOR 10 + +/// Indicates the patch version of LibTorch. +#define TORCH_VERSION_PATCH 0 + +/// Indicates the ABI version tag of LibTorch. +#define TORCH_VERSION_ABI_TAG 0 + +/// Indicates the version of LibTorch as a string literal. +#define TORCH_VERSION "2.10.0" + +/// Indicates the ABI version of LibTorch as a single uint64. +/// [ byte ][ byte ][ byte ][ byte ][ byte ][ byte ][ byte ][ byte ] +/// [ MAJ ][ MIN ][ PATCH][ ABI TAG ] +#define TORCH_ABI_VERSION \ + (((0ULL + TORCH_VERSION_MAJOR) << 56) | \ + ((0ULL + TORCH_VERSION_MINOR) << 48) | /* NOLINT(whitespace/indent) */ \ + ((0ULL + TORCH_VERSION_PATCH) << 40) | /* NOLINT(whitespace/indent) */ \ + ((0ULL + TORCH_VERSION_ABI_TAG) << 0)) /* NOLINT(whitespace/indent) */ diff --git a/test/cpp/compat/c10_storage_test.cc b/test/cpp/compat/c10_storage_test.cc index d94dee0d47b605..947786fb929870 100644 --- a/test/cpp/compat/c10_storage_test.cc +++ b/test/cpp/compat/c10_storage_test.cc @@ -732,9 +732,13 @@ TEST(StorageTest, DataPtrHelpersAndAllocatorSimpleDataPtrChecks) { dp.unsafe_set_device(c10::Device(c10::DeviceType::CPU)); ASSERT_EQ(dp.device().type(), c10::DeviceType::CPU); - // is_simple_data_ptr: context==nullptr branch. + // PyTorch only treats context==data as a simple DataPtr. RawCompatibleAllocator compatible_alloc; - ASSERT_TRUE(compatible_alloc.is_simple_data_ptr(dp)); + ASSERT_FALSE(compatible_alloc.is_simple_data_ptr(dp)); + + // is_simple_data_ptr: context==data branch. + c10::DataPtr simple = compatible_alloc.allocate(4); + ASSERT_TRUE(compatible_alloc.is_simple_data_ptr(simple)); // is_simple_data_ptr: context!=data branch. c10::DataPtr non_simple = RawIncompatibleAllocator().allocate(4); From fcb126aaf77e6877cd6c2a6fd7fb29ffab473673 Mon Sep 17 00:00:00 2001 From: youge325 Date: Mon, 30 Mar 2026 14:12:59 +0800 Subject: [PATCH 7/8] try to fix Windows build --- paddle/phi/api/include/compat/c10/cuda/CUDAFunctions.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/paddle/phi/api/include/compat/c10/cuda/CUDAFunctions.h b/paddle/phi/api/include/compat/c10/cuda/CUDAFunctions.h index bcc097cadd033d..25a0301783b606 100644 --- a/paddle/phi/api/include/compat/c10/cuda/CUDAFunctions.h +++ b/paddle/phi/api/include/compat/c10/cuda/CUDAFunctions.h @@ -22,12 +22,17 @@ c10::DeviceIndex device_count(); void device_synchronize(); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void __inline__ stream_synchronize(gpuStream_t stream) { phi::backends::gpu::GpuStreamSync(stream); } +#endif + } // namespace c10::cuda namespace at::cuda { using c10::cuda::device_synchronize; +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) using c10::cuda::stream_synchronize; +#endif } // namespace at::cuda From 53cb19c603b182add90dcbbe7d258dcc54a69ab9 Mon Sep 17 00:00:00 2001 From: youge325 Date: Thu, 2 Apr 2026 17:08:25 +0800 Subject: [PATCH 8/8] revert raw_stream() and fix getStreamFromPool --- .../api/include/compat/c10/cuda/CUDAStream.h | 22 +++++++++++++------ test/cpp/compat/c10_Stream_test.cc | 20 +++++++++++++++++ 2 files changed, 35 insertions(+), 7 deletions(-) diff --git a/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h b/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h index f0f8f23ee1b711..2ae22655f8423b 100644 --- a/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h +++ b/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h @@ -150,6 +150,8 @@ class CUDAStream { Device device() const { return Device(DeviceType::CUDA, device_index()); } + cudaStream_t raw_stream() const { return stream(); } + struct c10::StreamData3 pack3() const { return stream_.pack3(); } @@ -228,15 +230,13 @@ inline CUDAStream getStreamFromPool(const int priority, } /** - * Set the current CUDA stream for the device of the given stream in the - * calling thread. + * Get a new stream from the CUDA stream pool. * - * Implements per-thread, per-device current stream semantics: the change is - * local to the calling OS thread and does not affect any shared state such as - * Paddle's GPUContext. Other threads continue to see their own current stream. + * This overload matches PyTorch's bool-based entry point and preserves the + * single-argument form `getStreamFromPool(true)` for high-priority requests. */ -inline CUDAStream getStreamFromPool(const bool isHighPriority, - c10::DeviceIndex device_index) { +inline CUDAStream getStreamFromPool(const bool isHighPriority = false, + c10::DeviceIndex device_index = -1) { return getStreamFromPool(isHighPriority ? -1 : 0, device_index); } @@ -246,6 +246,14 @@ inline CUDAStream getStreamFromExternal(cudaStream_t ext_stream, return make_cuda_stream(ext_stream, device_index); } +/** + * Set the current CUDA stream for the device of the given stream in the + * calling thread. + * + * Implements per-thread, per-device current stream semantics: the change is + * local to the calling OS thread and does not affect any shared state such as + * Paddle's GPUContext. Other threads continue to see their own current stream. + */ inline void setCurrentCUDAStream(CUDAStream stream) { c10::DeviceIndex idx = stream.unwrap().device_index(); detail::check_device_index(idx); diff --git a/test/cpp/compat/c10_Stream_test.cc b/test/cpp/compat/c10_Stream_test.cc index 7ee6a4c5f682d4..d41421c10abe53 100644 --- a/test/cpp/compat/c10_Stream_test.cc +++ b/test/cpp/compat/c10_Stream_test.cc @@ -112,6 +112,26 @@ TEST(CUDAStreamTest, DefaultStreamIsStable) { EXPECT_EQ(s1, s2); } +TEST(CUDAStreamTest, GetStreamFromPoolBoolOverloadPreservesHighPriority) { + SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); + auto low_priority_stream = + c10::cuda::getStreamFromPool(/*isHighPriority=*/false); + auto high_priority_stream = + c10::cuda::getStreamFromPool(/*isHighPriority=*/true); + auto explicit_high_priority_stream = c10::cuda::getStreamFromPool(-1); + + const int low_priority = low_priority_stream.priority(); + const int high_priority = high_priority_stream.priority(); + const int explicit_high_priority = explicit_high_priority_stream.priority(); + + if (low_priority == explicit_high_priority) { + return; + } + + EXPECT_EQ(high_priority, explicit_high_priority); + EXPECT_NE(high_priority, low_priority); +} + // After setCurrentCUDAStream redirects the per-thread current stream, // getDefaultCUDAStream must still return the null stream. TEST(CUDAStreamTest, DefaultStreamUnaffectedBySetCurrentCUDAStream) {