diff --git a/.gitignore b/.gitignore index b2a73328821657..abd75c85cc55ab 100644 --- a/.gitignore +++ b/.gitignore @@ -70,6 +70,7 @@ third_party/ *~ bazel-* .humanize +.codex build_* # clion workspace. diff --git a/paddle/phi/api/include/compat/ATen/core/TensorBody.h b/paddle/phi/api/include/compat/ATen/core/TensorBody.h index a12dfba80b5bec..6445befda15ee5 100644 --- a/paddle/phi/api/include/compat/ATen/core/TensorBody.h +++ b/paddle/phi/api/include/compat/ATen/core/TensorBody.h @@ -687,12 +687,6 @@ class Tensor : public TensorBase { } void record_stream(at::Stream s) const; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - void record_stream(at::cuda::CUDAStream s) const; - // TODO(youge325): Remove after DeepEP paddle branch is updated to use - // at::Stream - void record_stream(cudaStream_t s) const; -#endif Tensor var(int dim) const { return var(at::IntArrayRef{dim}, true, false); } diff --git a/paddle/phi/api/include/compat/ATen/core/ivalue.h b/paddle/phi/api/include/compat/ATen/core/ivalue.h index d22e19c4cdc1c1..f1151e2dcf537d 100644 --- a/paddle/phi/api/include/compat/ATen/core/ivalue.h +++ b/paddle/phi/api/include/compat/ATen/core/ivalue.h @@ -224,6 +224,16 @@ class IValue { bool is_custom_class() const { return tag_ == TypeTag::CustomClass; } bool is_tuple() const { return tag_ == TypeTag::Tuple; } + bool isNone() const { return is_none(); } + bool isBool() const { return is_bool(); } + bool isInt() const { return is_int(); } + bool isDouble() const { return is_double(); } + bool isString() const { return is_string(); } + bool isList() const { return is_list(); } + bool isTensor() const { return is_tensor(); } + bool isCustomClass() const { return is_custom_class(); } + bool isTuple() const { return is_tuple(); } + bool to_bool() const { if (!is_bool()) throw std::runtime_error("Not a bool"); return std::get(value_); @@ -280,6 +290,39 @@ class IValue { return static_cast(std::get(value_)); } + bool toBool() const { return to_bool(); } + int64_t toInt() const { return to_int(); } + double toDouble() const { return to_double(); } + const std::string& toStringRef() const { return to_string(); } + std::string_view toStringView() const { return to_string_view(); } + at::Tensor toTensor() const { return to_tensor(); } + at::ScalarType toScalarType() const { return to_scalar_type(); } + + std::string tagKind() const { + switch (tag_) { + case TypeTag::None: + return "None"; + case TypeTag::Bool: + return "Bool"; + case TypeTag::Int: + return "Int"; + case TypeTag::Double: + return "Double"; + case TypeTag::String: + return "String"; + case TypeTag::Tensor: + return "Tensor"; + case TypeTag::GenericList: + return "GenericList"; + case TypeTag::CustomClass: + return "CustomClass"; + case TypeTag::Tuple: + return "Tuple"; + default: + return "InvalidTag"; + } + } + template intrusive_ptr to_custom_class() const { if (!is_custom_class()) throw std::runtime_error("Not a custom class"); @@ -637,3 +680,7 @@ intrusive_ptr generic_to(const IValue& ivalue, } } // namespace torch + +namespace c10 { +using IValue = ::torch::IValue; +} diff --git a/paddle/phi/api/include/compat/ATen/ops/arange.h b/paddle/phi/api/include/compat/ATen/ops/arange.h index 54f08137119c45..b0e05d4e9ed56d 100644 --- a/paddle/phi/api/include/compat/ATen/ops/arange.h +++ b/paddle/phi/api/include/compat/ATen/ops/arange.h @@ -25,50 +25,48 @@ namespace at { -inline at::Tensor arange(const at::Scalar& end, - at::TensorOptions options = {}) { - if (options.pinned_memory()) { - // Pinning memory is only supported for CPU tensors - if (options.has_device() && !options.device().is_cpu()) { - PD_THROW( - "pin_memory=true requires device to be CPU, but got non-CPU device"); - } - phi::Place base_place = options._PD_GetPlace(); - phi::Place pinned_place = compat::_PD_GetCreatePinnedPlace(base_place); - auto dense = paddle::experimental::arange( - paddle::experimental::full({}, 0, phi::DataType::FLOAT64), - paddle::experimental::full( - {}, end.to(), phi::DataType::FLOAT64), - paddle::experimental::full({}, 1, phi::DataType::FLOAT64), - compat::_PD_AtenScalarTypeToPhiDataType(options.dtype()), - phi::CPUPlace()); - return dense.copy_to(pinned_place, /*blocking=*/true); +namespace detail { + +inline bool _PD_IsIntegralArangeScalar(const at::Scalar& scalar) { + switch (scalar.dtype()) { + case phi::DataType::BOOL: + case phi::DataType::UINT8: + case phi::DataType::INT8: + case phi::DataType::UINT16: + case phi::DataType::INT16: + case phi::DataType::UINT32: + case phi::DataType::INT32: + case phi::DataType::UINT64: + case phi::DataType::INT64: + return true; + default: + return false; } - return paddle::experimental::arange( - paddle::experimental::full({}, 0, phi::DataType::FLOAT64), - paddle::experimental::full({}, end.to(), phi::DataType::FLOAT64), - paddle::experimental::full({}, 1, phi::DataType::FLOAT64), - compat::_PD_AtenScalarTypeToPhiDataType(options.dtype()), - options._PD_GetPlace()); } -inline at::Tensor arange(const at::Scalar& end, - ::std::optional dtype, - ::std::optional layout, - ::std::optional device, - ::std::optional pin_memory) { - auto options = - at::TensorOptions() - .dtype(dtype.value_or(c10::get_default_dtype_as_scalartype())) - .layout(layout) - .device(device.value_or(at::kCPU)) - .pinned_memory(pin_memory); - return arange(end, options); +inline at::ScalarType _PD_ResolveArangeDtype(const at::Scalar& start, + const at::Scalar& end, + const at::Scalar& step, + const at::TensorOptions& options) { + if (options.has_dtype()) { + return options.dtype().toScalarType(); + } + if (_PD_IsIntegralArangeScalar(start) && _PD_IsIntegralArangeScalar(end) && + _PD_IsIntegralArangeScalar(step)) { + return at::kLong; + } + return c10::get_default_dtype_as_scalartype(); } +} // namespace detail + inline at::Tensor arange(const at::Scalar& start, const at::Scalar& end, + const at::Scalar& step, at::TensorOptions options = {}) { + // Match PyTorch: step must be non-zero and consistent with (end - start). + at::native::arange_check_bounds(start, end, step); + auto dtype = detail::_PD_ResolveArangeDtype(start, end, step, options); if (options.pinned_memory()) { // Pinning memory is only supported for CPU tensors if (options.has_device() && !options.device().is_cpu()) { @@ -82,8 +80,9 @@ inline at::Tensor arange(const at::Scalar& start, {}, start.to(), phi::DataType::FLOAT64), paddle::experimental::full( {}, end.to(), phi::DataType::FLOAT64), - paddle::experimental::full({}, 1, phi::DataType::FLOAT64), - compat::_PD_AtenScalarTypeToPhiDataType(options.dtype()), + paddle::experimental::full( + {}, step.to(), phi::DataType::FLOAT64), + compat::_PD_AtenScalarTypeToPhiDataType(dtype), phi::CPUPlace()); return dense.copy_to(pinned_place, /*blocking=*/true); } @@ -91,58 +90,47 @@ inline at::Tensor arange(const at::Scalar& start, paddle::experimental::full( {}, start.to(), phi::DataType::FLOAT64), paddle::experimental::full({}, end.to(), phi::DataType::FLOAT64), - paddle::experimental::full({}, 1, phi::DataType::FLOAT64), - compat::_PD_AtenScalarTypeToPhiDataType(options.dtype()), + paddle::experimental::full({}, step.to(), phi::DataType::FLOAT64), + compat::_PD_AtenScalarTypeToPhiDataType(dtype), options._PD_GetPlace()); } -inline at::Tensor arange(const at::Scalar& start, - const at::Scalar& end, +inline at::Tensor arange(const at::Scalar& end, + at::TensorOptions options = {}) { + return arange(/*start=*/0, end, /*step=*/1, options); +} + +inline at::Tensor arange(const at::Scalar& end, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory) { - auto options = - at::TensorOptions() - .dtype(dtype.value_or(c10::get_default_dtype_as_scalartype())) - .layout(layout) - .device(device.value_or(at::kCPU)) - .pinned_memory(pin_memory); - return arange(start, end, options); + auto options = at::TensorOptions() + .dtype(dtype) + .layout(layout) + .device(device) + .pinned_memory(pin_memory); + return arange(/*start=*/0, end, /*step=*/1, options); } inline at::Tensor arange(const at::Scalar& start, const at::Scalar& end, - const at::Scalar& step, at::TensorOptions options = {}) { - // Match PyTorch: step must be non-zero and consistent with (end - start). - at::native::arange_check_bounds(start, end, step); - if (options.pinned_memory()) { - // Pinning memory is only supported for CPU tensors - if (options.has_device() && !options.device().is_cpu()) { - PD_THROW( - "pin_memory=true requires device to be CPU, but got non-CPU device"); - } - phi::Place base_place = options._PD_GetPlace(); - phi::Place pinned_place = compat::_PD_GetCreatePinnedPlace(base_place); - auto dense = paddle::experimental::arange( - paddle::experimental::full( - {}, start.to(), phi::DataType::FLOAT64), - paddle::experimental::full( - {}, end.to(), phi::DataType::FLOAT64), - paddle::experimental::full( - {}, step.to(), phi::DataType::FLOAT64), - compat::_PD_AtenScalarTypeToPhiDataType(options.dtype()), - phi::CPUPlace()); - return dense.copy_to(pinned_place, /*blocking=*/true); - } - return paddle::experimental::arange( - paddle::experimental::full( - {}, start.to(), phi::DataType::FLOAT64), - paddle::experimental::full({}, end.to(), phi::DataType::FLOAT64), - paddle::experimental::full({}, step.to(), phi::DataType::FLOAT64), - compat::_PD_AtenScalarTypeToPhiDataType(options.dtype()), - options._PD_GetPlace()); + return arange(start, end, /*step=*/1, options); +} + +inline at::Tensor arange(const at::Scalar& start, + const at::Scalar& end, + ::std::optional dtype, + ::std::optional layout, + ::std::optional device, + ::std::optional pin_memory) { + auto options = at::TensorOptions() + .dtype(dtype) + .layout(layout) + .device(device) + .pinned_memory(pin_memory); + return arange(start, end, /*step=*/1, options); } inline at::Tensor arange(const at::Scalar& start, @@ -152,12 +140,11 @@ inline at::Tensor arange(const at::Scalar& start, ::std::optional layout, ::std::optional device, ::std::optional pin_memory) { - auto options = - at::TensorOptions() - .dtype(dtype.value_or(c10::get_default_dtype_as_scalartype())) - .layout(layout) - .device(device.value_or(at::kCPU)) - .pinned_memory(pin_memory); + auto options = at::TensorOptions() + .dtype(dtype) + .layout(layout) + .device(device) + .pinned_memory(pin_memory); return arange(start, end, step, options); } diff --git a/paddle/phi/api/include/compat/ATen/ops/equal.h b/paddle/phi/api/include/compat/ATen/ops/equal.h index 4619144f8f5aba..1ac49d9e245d74 100644 --- a/paddle/phi/api/include/compat/ATen/ops/equal.h +++ b/paddle/phi/api/include/compat/ATen/ops/equal.h @@ -22,6 +22,12 @@ namespace at { inline bool equal(const at::Tensor& self, const at::Tensor& other) { + PD_CHECK(self.defined(), + "Expected a proper Tensor but got None (or an undefined Tensor in " + "C++)"); + PD_CHECK(other.defined(), + "Expected a proper Tensor but got None (or an undefined Tensor in " + "C++)"); PD_CHECK(self.device() == other.device(), "Cannot compare two tensors on " "different devices. Got: ", diff --git a/paddle/phi/api/include/compat/ATen/ops/record_stream.h b/paddle/phi/api/include/compat/ATen/ops/record_stream.h index 73cb5dd4b2247c..ff43391cdc796c 100644 --- a/paddle/phi/api/include/compat/ATen/ops/record_stream.h +++ b/paddle/phi/api/include/compat/ATen/ops/record_stream.h @@ -49,22 +49,4 @@ inline void Tensor::record_stream(at::Stream s) const { #endif } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -inline void Tensor::record_stream(at::cuda::CUDAStream s) const { - record_stream(static_cast(s)); -} - -// TODO(youge325): Remove after DeepEP paddle branch is updated to use -// at::Stream -inline void Tensor::record_stream(cudaStream_t s) const { - auto dense_tensor = - std::dynamic_pointer_cast(tensor_.impl()); - PD_CHECK(dense_tensor != nullptr, - "record_stream only supports DenseTensor, but got a non-dense " - "tensor implementation."); - PD_CHECK(dense_tensor->place().GetType() != phi::AllocationType::CPU, - "record_stream is not supported for CPU tensors."); - paddle::memory::RecordStream(dense_tensor->Holder(), s); -} -#endif } // namespace at diff --git a/paddle/phi/api/include/compat/ATen/ops/resize.h b/paddle/phi/api/include/compat/ATen/ops/resize.h index 44232e54ff9353..ee4807027d0b31 100644 --- a/paddle/phi/api/include/compat/ATen/ops/resize.h +++ b/paddle/phi/api/include/compat/ATen/ops/resize.h @@ -23,13 +23,31 @@ namespace at { -// resize_ - in-place resize using reshape +// resize_ - use reshape for same-numel cases and set_ for storage-changing +// cases so repeated resize_ calls stay stable. inline const at::Tensor& Tensor::resize_( at::IntArrayRef size, ::std::optional memory_format) const { - auto result = - paddle::experimental::reshape(tensor_, size._PD_ToPaddleIntArray()); - const_cast(this)->tensor_ = result; + if (memory_format.has_value()) { + TORCH_CHECK(*memory_format == at::MemoryFormat::Contiguous, + "resize_ only supports contiguous memory format, but got ", + static_cast(*memory_format)); + } + + std::vector dims(size.begin(), size.end()); + int64_t new_numel = 1; + for (auto dim : dims) { + new_numel *= dim; + } + + if (tensor_.numel() == new_numel) { + const_cast(this)->tensor_ = + paddle::experimental::reshape(tensor_, phi::IntArray(dims)); + return *this; + } + + auto source = tensor_.copy_to(tensor_.place(), /*blocking=*/true); + paddle::experimental::set_(const_cast(this)->tensor_, source, dims); return *this; } diff --git a/paddle/phi/api/include/compat/ATen/ops/select.h b/paddle/phi/api/include/compat/ATen/ops/select.h index 8c859da44349b6..6c522db600add2 100644 --- a/paddle/phi/api/include/compat/ATen/ops/select.h +++ b/paddle/phi/api/include/compat/ATen/ops/select.h @@ -19,13 +19,35 @@ namespace at { inline at::Tensor select(const at::Tensor& self, int64_t dim, int64_t index) { + // Normalize dim to positive value for error messages + int64_t orig_dim = dim; if (dim < 0) { dim += self.dim(); } - // Handle negative indexing + // Check dim is valid + if (dim < 0 || dim >= self.dim()) { + PD_CHECK(false, + "select(): index ", + orig_dim, + " out of range for tensor of size ", + self.sizes(), + " at dimension ", + orig_dim); + } + // Handle negative index + int64_t orig_index = index; if (index < 0) { - int64_t dim_size = self.size(dim); - index = dim_size + index; + index = self.size(dim) + index; + } + // Check index is valid + if (index < 0 || index >= self.size(dim)) { + PD_CHECK(false, + "select(): index ", + orig_index, + " out of range for tensor of size ", + self.sizes(), + " at dimension ", + orig_dim < 0 ? orig_dim + self.dim() : orig_dim); } return Tensor( diff --git a/paddle/phi/api/include/compat/ATen/ops/std.h b/paddle/phi/api/include/compat/ATen/ops/std.h index b8600de2c857d9..ed7875f64431c1 100644 --- a/paddle/phi/api/include/compat/ATen/ops/std.h +++ b/paddle/phi/api/include/compat/ATen/ops/std.h @@ -32,6 +32,21 @@ inline Tensor std_impl(const Tensor& self, const std::vector& dims_vec, double correction_value, bool keepdim) { + // Validate dimensions before processing + int64_t ndim = self.dim(); + for (int64_t d : dims_vec) { + int64_t dim_idx = d < 0 ? d + ndim : d; + if (dim_idx < 0 || dim_idx >= ndim) { + PD_CHECK(false, + "Dimension out of range (expected to be in range of [", + -ndim, + ", ", + ndim - 1, + "], but got ", + d, + ")"); + } + } phi::IntArray dims_int_array(dims_vec); paddle::Tensor tensor = self._PD_GetInner(); diff --git a/paddle/phi/api/include/compat/CMakeLists.txt b/paddle/phi/api/include/compat/CMakeLists.txt index 3a3c939b44cba7..3f0c635261a0d8 100644 --- a/paddle/phi/api/include/compat/CMakeLists.txt +++ b/paddle/phi/api/include/compat/CMakeLists.txt @@ -1,5 +1,6 @@ collect_srcs(api_srcs SRCS c10/core/Device.cpp) collect_srcs(api_srcs SRCS c10/core/Stream.cpp) +collect_srcs(api_srcs SRCS c10/cuda/CUDAFunctions.cpp) collect_srcs(api_srcs SRCS c10/util/typeid.cpp) collect_srcs(api_srcs SRCS ATen/cuda/EmptyTensor.cpp) collect_srcs(api_srcs SRCS ATen/cuda/CUDAContextLight.cpp) diff --git a/paddle/phi/api/include/compat/c10/core/Allocator.h b/paddle/phi/api/include/compat/c10/core/Allocator.h index a35070936031ad..14a9f2f7dd57d1 100644 --- a/paddle/phi/api/include/compat/c10/core/Allocator.h +++ b/paddle/phi/api/include/compat/c10/core/Allocator.h @@ -23,7 +23,9 @@ #include #include +#include #include +#include #include #include #include @@ -37,6 +39,15 @@ namespace c10 { // Deleter function pointer type (compatible with LibTorch) using DeleterFnPtr = void (*)(void*); +using CaptureId_t = uint64_t; +using MempoolId_t = std::pair; + +struct MempoolIdHash { + std::size_t operator()(const MempoolId_t& mempool_id) const noexcept { + return mempool_id.first != 0 ? mempool_id.first : mempool_id.second; + } +}; + // DataPtr class compatible with LibTorch's c10::DataPtr // Wraps a pointer with associated device and deleter class DataPtr { @@ -63,6 +74,7 @@ class DataPtr { void clear() { ptr_.clear(); } void* get() const { return ptr_.get(); } + void* mutable_get() { return ptr_.get(); } void* get_context() const { return ptr_.get_context(); } void* release_context() { return ptr_.release_context(); } @@ -128,15 +140,14 @@ struct Allocator { // Requires: input data was allocated by the same allocator. DataPtr clone(const void* data, std::size_t n) { auto new_data = allocate(n); - copy_data(new_data.get(), data, n); + copy_data(new_data.mutable_get(), data, n); return new_data; } // Checks if DataPtr has a simple context, not wrapped with any out of the // ordinary contexts. virtual bool is_simple_data_ptr(const DataPtr& data_ptr) const { - return data_ptr.get_context() == nullptr || - data_ptr.get_context() == data_ptr.get(); + return data_ptr.get() == data_ptr.get_context(); } // If this returns a non nullptr, it means that allocate() @@ -176,6 +187,90 @@ struct Allocator { } }; +struct InefficientStdFunctionContext { + void* ptr_{nullptr}; + std::function deleter_; + + InefficientStdFunctionContext(void* ptr, std::function deleter) + : ptr_(ptr), deleter_(std::move(deleter)) {} + + InefficientStdFunctionContext(const InefficientStdFunctionContext&) = delete; + + InefficientStdFunctionContext(InefficientStdFunctionContext&& rhs) noexcept + : ptr_(std::exchange(rhs.ptr_, nullptr)), + deleter_(std::move(rhs.deleter_)) {} + + InefficientStdFunctionContext& operator=( + const InefficientStdFunctionContext&) = delete; + + InefficientStdFunctionContext& operator=( + InefficientStdFunctionContext&& rhs) { + this->~InefficientStdFunctionContext(); + ptr_ = std::exchange(rhs.ptr_, nullptr); + deleter_ = std::move(rhs.deleter_); + return *this; + } + + ~InefficientStdFunctionContext() { + if (deleter_) { + deleter_(ptr_); + } + } + + static DataPtr makeDataPtr(void* ptr, + std::function deleter, + Device device) { + return DataPtr(ptr, + new InefficientStdFunctionContext(ptr, std::move(deleter)), + &deleteContext, + device); + } + + private: + static void deleteContext(void* ptr) { + delete static_cast(ptr); + } +}; + +inline constexpr size_t kAllocatorRegistrySize = + static_cast(DeviceType::CUSTOM) + 1; + +inline std::array g_allocator_array{}; +inline std::array g_allocator_priority{}; + +inline size_t allocator_device_index(DeviceType t) { + const size_t index = static_cast(t); + TORCH_CHECK(index < kAllocatorRegistrySize, + "Allocator device type index out of range: ", + index); + return index; +} + +inline void SetAllocator(DeviceType t, Allocator* alloc, uint8_t priority = 0) { + const size_t index = allocator_device_index(t); + if (priority >= g_allocator_priority[index]) { + g_allocator_array[index] = alloc; + g_allocator_priority[index] = priority; + } +} + +inline Allocator* GetAllocator(const DeviceType& t) { + const size_t index = allocator_device_index(t); + auto* alloc = g_allocator_array[index]; + TORCH_CHECK(alloc != nullptr, "Allocator for ", t, " is not set."); + return alloc; +} + +template +struct AllocatorRegisterer { + explicit AllocatorRegisterer(Allocator* alloc) { SetAllocator(t, alloc); } +}; + +#define REGISTER_ALLOCATOR(t, f) \ + namespace { \ + static c10::AllocatorRegisterer g_allocator_d(f); \ + } + } // namespace c10 namespace at { diff --git a/paddle/phi/api/include/compat/c10/core/Device.cpp b/paddle/phi/api/include/compat/c10/core/Device.cpp index 9a170f10a335be..db8e74c3feb24f 100644 --- a/paddle/phi/api/include/compat/c10/core/Device.cpp +++ b/paddle/phi/api/include/compat/c10/core/Device.cpp @@ -18,7 +18,13 @@ #include #include + +#include #include +#include +#include +#include + #include "paddle/common/enforce.h" namespace c10 { @@ -45,44 +51,98 @@ const char* DeviceTypeToString(DeviceType type) { DeviceType parse_type(const std::string& device_string) { static const std::array, - static_cast(4)> + static_cast(5)> types = {{ {"cpu", DeviceType::CPU}, {"cuda", DeviceType::CUDA}, {"ipu", DeviceType::IPU}, {"xpu", DeviceType::XPU}, + {"privateuseone", DeviceType::PrivateUse1}, }}; - for (const auto& type_pair : types) { - if (device_string == type_pair.first) { - return type_pair.second; - } + auto device = std::find_if( + types.begin(), + types.end(), + [&device_string](const std::pair& p) { + return p.first && p.first == device_string; + }); + if (device != types.end()) { + return device->second; } - PADDLE_THROW(::common::errors::InvalidArgument( - "Unknown device type: '%s'. Supported device types are ", - "'cpu', 'cuda', 'ipu', and 'xpu'.", - device_string)); + TORCH_CHECK(false, + "Expected one of cpu, cuda, ipu, xpu, privateuseone device type " + "at start of device string: ", + device_string); } +enum DeviceStringParsingState { kStart, kIndexStart, kIndexRest, kError }; + Device::Device(const std::string& device_string) : Device(Type::CPU) { TORCH_CHECK(!device_string.empty(), "Device string must not be empty"); - auto colon_pos = device_string.find(':'); - std::string type_str = colon_pos == std::string::npos - ? device_string - : device_string.substr(0, colon_pos); - type_ = parse_type(type_str); - index_ = -1; - if (colon_pos != std::string::npos) { - std::string index_str = device_string.substr(colon_pos + 1); - try { - index_ = static_cast(std::stoi(index_str)); - } catch (const std::invalid_argument&) { - PADDLE_THROW(::common::errors::InvalidArgument( - "Invalid device index: '%s' is not a number.", index_str)); - } catch (const std::out_of_range&) { - PADDLE_THROW(::common::errors::InvalidArgument( - "Invalid device index: '%s' is out of range.", index_str)); + + std::string device_name, device_index_str; + DeviceStringParsingState pstate = DeviceStringParsingState::kStart; + + for (size_t i = 0; + pstate != DeviceStringParsingState::kError && i < device_string.size(); + ++i) { + const char ch = device_string.at(i); + const unsigned char uch = static_cast(ch); + switch (pstate) { + case DeviceStringParsingState::kStart: + if (ch != ':') { + if (std::isalpha(uch) || ch == '_') { + device_name.push_back(ch); + } else { + pstate = DeviceStringParsingState::kError; + } + } else { + pstate = DeviceStringParsingState::kIndexStart; + } + break; + case DeviceStringParsingState::kIndexStart: + if (std::isdigit(uch)) { + device_index_str.push_back(ch); + pstate = DeviceStringParsingState::kIndexRest; + } else { + pstate = DeviceStringParsingState::kError; + } + break; + case DeviceStringParsingState::kIndexRest: + if (device_index_str.at(0) == '0') { + pstate = DeviceStringParsingState::kError; + break; + } + if (std::isdigit(uch)) { + device_index_str.push_back(ch); + } else { + pstate = DeviceStringParsingState::kError; + } + break; + case DeviceStringParsingState::kError: + break; + } + } + + const bool has_error = device_name.empty() || + pstate == DeviceStringParsingState::kError || + (pstate == DeviceStringParsingState::kIndexStart && + device_index_str.empty()); + TORCH_CHECK(!has_error, "Invalid device string: '", device_string, "'"); + + try { + if (!device_index_str.empty()) { + index_ = static_cast(std::stoi(device_index_str)); } + } catch (const std::exception&) { + TORCH_CHECK(false, + "Could not parse device index '", + device_index_str, + "' in device string '", + device_string, + "'"); } + type_ = parse_type(device_name); + validate(); } std::string Device::str() const { diff --git a/paddle/phi/api/include/compat/c10/core/Device.h b/paddle/phi/api/include/compat/c10/core/Device.h index 5197d9a52790e9..e2d9166014577c 100644 --- a/paddle/phi/api/include/compat/c10/core/Device.h +++ b/paddle/phi/api/include/compat/c10/core/Device.h @@ -24,7 +24,12 @@ using gpuStream_t = hipStream_t; #endif #include +#include +#include +#include +#include +#include #include #include @@ -42,13 +47,19 @@ struct Device final { index_(place.GetType() == phi::AllocationType::CPU ? static_cast(-1) : place.GetDeviceId()), - custom_device_type_(place.GetDeviceType()) {} + custom_device_type_(place.GetDeviceType()) { + validate(); + } Device(DeviceType type, DeviceIndex index = -1) - : type_(type), index_(index) {} // NOLINT + : type_(type), index_(index) { // NOLINT + validate(); + } Device(DeviceType type, DeviceIndex index, std::string custom_device_type) : type_(type), index_(index), - custom_device_type_(std::move(custom_device_type)) {} // NOLINT + custom_device_type_(std::move(custom_device_type)) { // NOLINT + validate(); + } /// Constructs a `Device` from a string description, for convenience. /// The string supplied must follow the following schema: @@ -63,10 +74,51 @@ struct Device final { DeviceType type() const noexcept { return type_; } + bool operator!=(const Device& other) const noexcept { + return !(*this == other); + } + + void set_index(DeviceIndex index) { + index_ = index; + validate(); + } + bool is_cuda() const noexcept { return type_ == DeviceType::CUDA; } + bool is_privateuseone() const noexcept { + return type_ == DeviceType::PrivateUse1; + } + + bool is_mps() const noexcept { return false; } + + bool is_hip() const noexcept { return false; } + + bool is_ve() const noexcept { return false; } + + bool is_xpu() const noexcept { return type_ == DeviceType::XPU; } + + bool is_ipu() const noexcept { return type_ == DeviceType::IPU; } + + bool is_xla() const noexcept { return false; } + + bool is_mtia() const noexcept { return false; } + + bool is_hpu() const noexcept { return false; } + + bool is_lazy() const noexcept { return false; } + + bool is_vulkan() const noexcept { return false; } + + bool is_metal() const noexcept { return false; } + + bool is_maia() const noexcept { return false; } + + bool is_meta() const noexcept { return false; } + bool is_cpu() const noexcept { return type_ == DeviceType::CPU; } + bool supports_as_strided() const noexcept { return type_ != DeviceType::IPU; } + std::string str() const; bool operator==(const Device& other) const noexcept { @@ -96,12 +148,37 @@ struct Device final { DeviceType type_{DeviceType::CPU}; DeviceIndex index_{-1}; std::string custom_device_type_; + + void validate() { +#ifndef NDEBUG + TORCH_INTERNAL_ASSERT(index_ >= -1, + "Device index must be -1 or non-negative, got ", + static_cast(index_)); + TORCH_INTERNAL_ASSERT(!is_cpu() || index_ <= 0, + "CPU device index must be -1 or zero, got ", + static_cast(index_)); +#endif + } }; std::ostream& operator<<(std::ostream& stream, const Device& device); } // namespace c10 +namespace std { +template <> +struct hash { + size_t operator()(c10::Device d) const noexcept { + static_assert(sizeof(c10::DeviceType) == 1, "DeviceType is not 8-bit"); + static_assert(sizeof(c10::DeviceIndex) == 1, "DeviceIndex is not 8-bit"); + uint32_t bits = static_cast(static_cast(d.type())) + << 16 | + static_cast(static_cast(d.index())); + return std::hash{}(bits); + } +}; +} // namespace std + namespace at { using c10::Device; using c10::DeviceIndex; diff --git a/paddle/phi/api/include/compat/c10/core/DeviceType.h b/paddle/phi/api/include/compat/c10/core/DeviceType.h index 9e6487dbadb061..a67b5d880e0b3e 100644 --- a/paddle/phi/api/include/compat/c10/core/DeviceType.h +++ b/paddle/phi/api/include/compat/c10/core/DeviceType.h @@ -14,6 +14,8 @@ #pragma once +#include +#include #include #include "paddle/phi/common/place.h" @@ -26,6 +28,7 @@ enum class DeviceType : int8_t { XPU = 12, IPU = 18, CUSTOM = 20, + PrivateUse1 = CUSTOM, }; constexpr DeviceType kCUDA = DeviceType::CUDA; @@ -33,6 +36,7 @@ constexpr DeviceType kCPU = DeviceType::CPU; constexpr DeviceType kCUSTOM = DeviceType::CUSTOM; constexpr DeviceType kXPU = DeviceType::XPU; constexpr DeviceType kIPU = DeviceType::IPU; +constexpr DeviceType kPrivateUse1 = DeviceType::PrivateUse1; inline phi::AllocationType DeviceTypeToPhi(DeviceType d) { switch (d) { @@ -103,12 +107,22 @@ inline std::ostream& operator<<(std::ostream& os, DeviceType d) { } // namespace c10 +namespace std { +template <> +struct hash { + std::size_t operator()(c10::DeviceType k) const noexcept { + return std::hash()(static_cast(k)); + } +}; +} // namespace std + namespace at { using c10::DeviceType; using c10::kCPU; using c10::kCUDA; using c10::kCUSTOM; using c10::kIPU; +using c10::kPrivateUse1; using c10::kXPU; } // namespace at @@ -118,5 +132,6 @@ using c10::kCPU; using c10::kCUDA; using c10::kCUSTOM; using c10::kIPU; +using c10::kPrivateUse1; using c10::kXPU; } // namespace torch diff --git a/paddle/phi/api/include/compat/c10/core/Event.h b/paddle/phi/api/include/compat/c10/core/Event.h index 5d2c2d10b710d5..04ea3c7eaae0f1 100644 --- a/paddle/phi/api/include/compat/c10/core/Event.h +++ b/paddle/phi/api/include/compat/c10/core/Event.h @@ -20,38 +20,6 @@ #include namespace c10 { -/** - * A backend-generic movable, not copyable, not thread-safe event. - * - * The design of this event follows that of CUDA and HIP events. These events - * are recorded and waited on by streams and can be rerecorded to, - * each rerecording essentially creating a new version of the event. - * For example, if (in CPU time), stream X is asked to record E, - * stream Y waits on E, and stream X is asked to record E again, then Y will - * wait for X to finish the first call to record and not the second, because - * it's waiting on the first version of event E, not the second. - * Querying an event only returns the status of its most recent version. - * - * Backend-generic events are implemented by this class and - * impl::InlineEvent. In addition to these events there are also - * some backend-specific events, like ATen's CUDAEvent. Each of these - * classes has its own use. - * - * impl::InlineEvent<...> or a backend-specific event should be - * preferred when the backend is known at compile time and known to - * be compiled. Backend-specific events may have additional functionality. - * - * This Event should be used if a particular backend may not be available, - * or the backend required is not known at compile time. - * - * These generic events are built on top of DeviceGuardImpls, analogous - * to DeviceGuard and InlineDeviceGuard. The name "DeviceGuardImpls," - * is no longer entirely accurate, as these classes implement the - * backend-specific logic for a generic backend interface. - * - * See DeviceGuardImplInterface.h for a list of all supported flags. - */ - #ifdef PADDLE_WITH_CUDA class EventPool { @@ -136,12 +104,6 @@ struct Event final { void record(const c10::cuda::CUDAStream &stream) { record(stream.unwrap()); } - // TODO(youge325): Remove after DeepEP paddle branch is updated to use - // c10::Stream - void record(const cudaStream_t &stream) { - C10_CUDA_CHECK(cudaEventRecord(cuda_event_, stream)); - } - void block(const Stream &stream) const { C10_CUDA_CHECK(cudaStreamWaitEvent( static_cast(stream.native_handle()), cuda_event_, 0)); diff --git a/paddle/phi/api/include/compat/c10/core/ScalarType.h b/paddle/phi/api/include/compat/c10/core/ScalarType.h index 5495e655040dcb..0fe99f5d1a0f04 100644 --- a/paddle/phi/api/include/compat/c10/core/ScalarType.h +++ b/paddle/phi/api/include/compat/c10/core/ScalarType.h @@ -130,58 +130,15 @@ struct dummy_int1_7_t {}; _(uint32_t, UINT32, UInt32) enum class PADDLE_API ScalarType : int8_t { - Byte = 0, - Char = 1, - Short = 2, - Int = 3, - Long = 4, - Half = 5, - Float = 6, - Double = 7, - ComplexHalf = 8, - ComplexFloat = 9, - ComplexDouble = 10, - Bool = 11, - QInt8 = 12, - QUInt8 = 13, - QInt32 = 14, - BFloat16 = 15, - QUInt4x2 = 16, - QUInt2x4 = 17, - Bits1x8 = 18, - Bits2x4 = 19, - Bits4x2 = 20, - Bits8 = 21, - Bits16 = 22, - Float8_e5m2 = 23, - Float8_e4m3fn = 24, - Float8_e5m2fnuz = 25, - Float8_e4m3fnuz = 26, - UInt16 = 27, - UInt32 = 28, - UInt64 = 29, - UInt1 = 30, - UInt2 = 31, - UInt3 = 32, - UInt4 = 33, - UInt5 = 34, - UInt6 = 35, - UInt7 = 36, - Int1 = 37, - Int2 = 38, - Int3 = 39, - Int4 = 40, - Int5 = 41, - Int6 = 42, - Int7 = 43, - Float8_e8m0fnu = 44, - Float4_e2m1fn_x2 = 45, - Undefined = 46, - NumOptions = 47 +#define DEFINE_ST_ENUM_VAL_(_1, _2, n) n, + AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(DEFINE_ST_ENUM_VAL_) +#undef DEFINE_ENUM_ST_ENUM_VAL_ +#define DEFINE_ST_ENUM_VAL_FOR_QINTS_(_1, n) n, + AT_FORALL_QINT_TYPES(DEFINE_ST_ENUM_VAL_FOR_QINTS_) +#undef DEFINE_ST_ENUM_VAL_FOR_QINTS_ + Undefined, + NumOptions }; - -constexpr uint16_t NumScalarTypes = - static_cast(ScalarType::NumOptions); namespace impl { // These are used to map ScalarTypes to C++ types. @@ -281,38 +238,6 @@ inline const char* toString(ScalarType t) { switch (t) { AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(DEFINE_CASE) - case ScalarType::QInt8: - return "QInt8"; - case ScalarType::QUInt8: - return "QUInt8"; - case ScalarType::QInt32: - return "QInt32"; - case ScalarType::QUInt4x2: - return "QUInt4x2"; - case ScalarType::QUInt2x4: - return "QUInt2x4"; - case ScalarType::ComplexHalf: - return "ComplexHalf"; - case ScalarType::Bits1x8: - return "Bits1x8"; - case ScalarType::Bits2x4: - return "Bits2x4"; - case ScalarType::Bits4x2: - return "Bits4x2"; - case ScalarType::Bits8: - return "Bits8"; - case ScalarType::Bits16: - return "Bits16"; - case ScalarType::Float8_e5m2fnuz: - return "Float8_e5m2fnuz"; - case ScalarType::Float8_e4m3fnuz: - return "Float8_e4m3fnuz"; - case ScalarType::Float8_e8m0fnu: - return "Float8_e8m0fnu"; - case ScalarType::Float4_e2m1fn_x2: - return "Float4_e2m1fn_x2"; - case ScalarType::Undefined: - return "Undefined"; default: return "UNKNOWN_SCALAR"; } @@ -326,18 +251,6 @@ inline size_t elementSize(ScalarType t) { switch (t) { AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(CASE_ELEMENTSIZE_CASE) - case ScalarType::QInt8: - case ScalarType::QUInt8: - case ScalarType::QUInt4x2: - case ScalarType::QUInt2x4: - case ScalarType::Bits1x8: - case ScalarType::Bits2x4: - case ScalarType::Bits4x2: - case ScalarType::Bits8: - return 1; - case ScalarType::QInt32: - case ScalarType::Bits16: - return 4; default: TORCH_CHECK(false, "Unknown ScalarType"); } @@ -410,7 +323,6 @@ inline bool isSignedType(ScalarType t) { // Complex types (treated as signed) case ScalarType::ComplexFloat: case ScalarType::ComplexDouble: - case ScalarType::ComplexHalf: return true; // Signed quantized types (explicitly return true) @@ -438,22 +350,11 @@ inline bool isSignedType(ScalarType t) { case ScalarType::QUInt8: case ScalarType::QUInt4x2: case ScalarType::QUInt2x4: - case ScalarType::Bits1x8: - case ScalarType::Bits2x4: - case ScalarType::Bits4x2: - case ScalarType::Bits8: - case ScalarType::Bits16: return false; // Bool is unsigned (using numeric_limits) CASE_ISSIGNED(Bool); - case ScalarType::Float8_e5m2fnuz: - case ScalarType::Float8_e4m3fnuz: - case ScalarType::Float8_e8m0fnu: - case ScalarType::Float4_e2m1fn_x2: - return true; - // Invalid/undefined types - should not happen in normal usage // If this is hit, it indicates a programming error or unsupported type case ScalarType::Undefined: diff --git a/paddle/phi/api/include/compat/c10/core/Stream.cpp b/paddle/phi/api/include/compat/c10/core/Stream.cpp index 9a52b8c9f9f05d..60873f6e05a93c 100644 --- a/paddle/phi/api/include/compat/c10/core/Stream.cpp +++ b/paddle/phi/api/include/compat/c10/core/Stream.cpp @@ -44,9 +44,11 @@ void* Stream::native_handle() const { return reinterpret_cast(static_cast(id_)); } #endif - PADDLE_THROW(::common::errors::Unimplemented( - "c10::Stream::native_handle() is not supported for device type %d", - static_cast(device_type()))); + // Match PyTorch error message format for unsupported device types + PD_CHECK(false, + "native_handle() is not supported for this device type (", + static_cast(device_type()), + ")"); } bool Stream::query() const { diff --git a/paddle/phi/api/include/compat/c10/core/Stream.h b/paddle/phi/api/include/compat/c10/core/Stream.h index 58912130daf303..f68e863eb931dd 100644 --- a/paddle/phi/api/include/compat/c10/core/Stream.h +++ b/paddle/phi/api/include/compat/c10/core/Stream.h @@ -90,9 +90,8 @@ class Stream final { }; inline std::ostream& operator<<(std::ostream& os, const Stream& s) { - os << "Stream(device_type=" << static_cast(s.device_type()) - << ", device_index=" << static_cast(s.device_index()) - << ", id=" << s.id() << ")"; + // Format: "stream {id} on device {device_type}:{device_index}" + os << "stream " << s.id() << " on device " << s.device(); return os; } @@ -106,3 +105,7 @@ struct hash { } }; } // namespace std + +namespace at { +using c10::Stream; +} diff --git a/paddle/phi/api/include/compat/c10/cuda/CUDAFunctions.cpp b/paddle/phi/api/include/compat/c10/cuda/CUDAFunctions.cpp new file mode 100644 index 00000000000000..482bf7036d7cea --- /dev/null +++ b/paddle/phi/api/include/compat/c10/cuda/CUDAFunctions.cpp @@ -0,0 +1,44 @@ +// Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +namespace c10::cuda { + +c10::DeviceIndex device_count() { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + return phi::backends::gpu::GetGPUDeviceCount(); +#else + // Return 0 instead of throwing to match PyTorch API semantics + // at::cuda::is_available() relies on this returning 0/false + return 0; +#endif +} + +void device_synchronize() { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + int curr_device_id = paddle::platform::GetCurrentDeviceId(); + paddle::platform::SetDeviceId(curr_device_id); +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); +#else + PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); +#endif +#else + PADDLE_THROW(common::errors::Unavailable( + "Paddle is not compiled with CUDA. Cannot visit device synchronize.")); +#endif +} + +} // namespace c10::cuda diff --git a/paddle/phi/api/include/compat/c10/cuda/CUDAFunctions.h b/paddle/phi/api/include/compat/c10/cuda/CUDAFunctions.h index 086c3f486cc6ae..25a0301783b606 100644 --- a/paddle/phi/api/include/compat/c10/cuda/CUDAFunctions.h +++ b/paddle/phi/api/include/compat/c10/cuda/CUDAFunctions.h @@ -18,37 +18,21 @@ namespace c10::cuda { -inline c10::DeviceIndex device_count() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - return phi::backends::gpu::GetGPUDeviceCount(); -#else - // Return 0 instead of throwing to match PyTorch API semantics - // at::cuda::is_available() relies on this returning 0/false - return 0; -#endif -} +c10::DeviceIndex device_count(); -inline void device_synchronize() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - int curr_device_id = paddle::platform::GetCurrentDeviceId(); - paddle::platform::SetDeviceId(curr_device_id); -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); -#else - PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); -#endif -#else - PADDLE_THROW(common::errors::Unavailable( - "Paddle is not compiled with CUDA. Cannot visit device synchronize.")); -#endif -} +void device_synchronize(); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void __inline__ stream_synchronize(gpuStream_t stream) { phi::backends::gpu::GpuStreamSync(stream); } +#endif + } // namespace c10::cuda namespace at::cuda { using c10::cuda::device_synchronize; +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) using c10::cuda::stream_synchronize; +#endif } // namespace at::cuda diff --git a/paddle/phi/api/include/compat/c10/cuda/CUDAGuard.h b/paddle/phi/api/include/compat/c10/cuda/CUDAGuard.h index ce819e69e64932..8f2193ef46ae0e 100644 --- a/paddle/phi/api/include/compat/c10/cuda/CUDAGuard.h +++ b/paddle/phi/api/include/compat/c10/cuda/CUDAGuard.h @@ -19,16 +19,43 @@ #pragma once #include +#include + #include + #include "paddle/phi/core/platform/cuda_device_guard.h" namespace c10::cuda { + +namespace detail { + +inline Device current_cuda_device() { + return Device(kCUDA, phi::backends::gpu::GetCurrentDeviceId()); +} + +inline Device normalize_cuda_device(Device device) { + TORCH_CHECK(device.is_cuda(), "Expected a CUDA device, but got ", device); + return Device(kCUDA, device.has_index() ? device.index() : 0); +} + +} // namespace detail + struct CUDAGuard { explicit CUDAGuard() = delete; // NOLINT - explicit CUDAGuard(DeviceIndex device_index) : guard_(device_index) {} + explicit CUDAGuard(DeviceIndex device_index) + : original_device_(detail::current_cuda_device()), + current_device_(original_device_), + guard_() { + set_index(device_index); + } - explicit CUDAGuard(Device device) : guard_(device._PD_GetInner()) {} + explicit CUDAGuard(Device device) + : original_device_(detail::current_cuda_device()), + current_device_(original_device_), + guard_() { + set_device(device); + } CUDAGuard(const CUDAGuard&) = delete; CUDAGuard& operator=(const CUDAGuard&) = delete; @@ -37,79 +64,86 @@ struct CUDAGuard { CUDAGuard& operator=(CUDAGuard&& other) = delete; ~CUDAGuard() = default; - void set_device(Device device) { guard_.SetDevice(device._PD_GetInner()); } + void set_device(Device device) { + current_device_ = detail::normalize_cuda_device(device); + guard_.SetDevice(current_device_._PD_GetInner()); + } void reset_device(Device device) { set_device(device); } void set_index(DeviceIndex device_index) { + current_device_ = Device(kCUDA, device_index); guard_.SetDeviceIndex(device_index); } - Device current_device() const { - return c10::Device(c10::kCUDA, phi::backends::gpu::GetCurrentDeviceId()); - } + Device original_device() const { return original_device_; } + + Device current_device() const { return current_device_; } private: + Device original_device_; + Device current_device_; paddle::platform::CUDADeviceGuard guard_; }; struct OptionalCUDAGuard { OptionalCUDAGuard() = default; - explicit OptionalCUDAGuard(std::optional device_opt) : guard_() { + explicit OptionalCUDAGuard(std::optional device_opt) { if (device_opt.has_value()) { - guard_.emplace(device_opt.value()._PD_GetInner()); + set_device(device_opt.value()); } } - explicit OptionalCUDAGuard(std::optional device_index_opt) - : guard_() { + explicit OptionalCUDAGuard(std::optional device_index_opt) { if (device_index_opt.has_value()) { - guard_.emplace(device_index_opt.value()); + set_index(device_index_opt.value()); } } - // Copy is not allowed OptionalCUDAGuard(const OptionalCUDAGuard&) = delete; OptionalCUDAGuard& operator=(const OptionalCUDAGuard&) = delete; OptionalCUDAGuard(OptionalCUDAGuard&& other) = delete; - OptionalCUDAGuard& operator=(OptionalCUDAGuard&& other) = delete; ~OptionalCUDAGuard() = default; void set_device(Device device) { - if (!guard_.has_value()) { - guard_.emplace(device._PD_GetInner()); - } else { - guard_->SetDevice(device._PD_GetInner()); - } + const Device normalized = detail::normalize_cuda_device(device); + init_if_needed(); + guard_->SetDevice(normalized._PD_GetInner()); + current_device_ = normalized; } - void reset_device(Device device) { - if (!guard_.has_value()) { - guard_.emplace(device._PD_GetInner()); - } else { - guard_->SetDevice(device._PD_GetInner()); - } - } + void reset_device(Device device) { set_device(device); } void set_index(DeviceIndex device_index) { - if (!guard_.has_value()) { - guard_.emplace(device_index); - } else { - guard_->SetDeviceIndex(device_index); - } + init_if_needed(); + guard_->SetDeviceIndex(device_index); + current_device_ = Device(kCUDA, device_index); } - std::optional current_device() const { - return guard_.has_value() - ? std::make_optional(c10::Device( - c10::kCUDA, phi::backends::gpu::GetCurrentDeviceId())) - : std::nullopt; + std::optional original_device() const { return original_device_; } + + std::optional current_device() const { return current_device_; } + + void reset() { + guard_.reset(); + original_device_.reset(); + current_device_.reset(); } private: + void init_if_needed() { + if (!guard_.has_value()) { + original_device_ = detail::current_cuda_device(); + current_device_ = original_device_; + guard_.emplace(); + } + } + + std::optional original_device_; + std::optional current_device_; std::optional guard_; }; diff --git a/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h b/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h index f88dd043317252..8f55bdef3f9a21 100644 --- a/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h +++ b/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h @@ -18,9 +18,14 @@ #include #include +#include #include #include +#include #include +#include +#include + #include "paddle/phi/api/include/context_pool.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/common/place.h" @@ -30,16 +35,13 @@ namespace c10::cuda { using StreamId = int64_t; -// ── Per-device stream pool and per-thread current stream ───────────────────── +static constexpr int max_compile_time_stream_priorities = 4; namespace detail { constexpr int kStreamsPerPool = 32; -// Upper bound for static pool/TLS arrays. 64 covers all current CUDA hardware. constexpr int kMaxDevices = 64; -// Device count is invariant after CUDA initialization; cache it to avoid -// repeated driver calls on the hot path. inline int gpu_device_count() { static const int count = phi::backends::gpu::GetGPUDeviceCount(); return count; @@ -74,7 +76,7 @@ inline StreamPoolState& get_pool(int device_index) { inline void init_pool(int device_index, StreamPoolState* state) { phi::backends::gpu::GPUDeviceGuard guard(device_index); int lo_pri = 0, hi_pri = 0; - cudaDeviceGetStreamPriorityRange(&lo_pri, &hi_pri); + C10_CUDA_CHECK(cudaDeviceGetStreamPriorityRange(&lo_pri, &hi_pri)); for (int i = 0; i < kStreamsPerPool; ++i) { C10_CUDA_CHECK(cudaStreamCreateWithPriority( &state->low_priority[i], cudaStreamNonBlocking, lo_pri)); @@ -83,9 +85,6 @@ inline void init_pool(int device_index, StreamPoolState* state) { } } -// Per-thread, per-device current stream state. -// thread_local inside an inline function is ODR-safe across translation units -// (C++11 §3.2): all TUs share the same thread-local instance per thread. struct TLSStreamState { cudaStream_t streams[kMaxDevices]{}; bool has_stream[kMaxDevices]{}; @@ -98,22 +97,24 @@ inline TLSStreamState& get_tls() { } // namespace detail -// ── CUDAStream ─────────────────────────────────────────────────────────────── - class CUDAStream { public: + enum Unchecked { UNCHECKED }; + CUDAStream() = delete; explicit CUDAStream(Stream stream) : stream_(stream) { TORCH_CHECK(stream_.device_type() == DeviceType::CUDA); } + explicit CUDAStream(Unchecked /*unused*/, Stream stream) : stream_(stream) {} + bool operator==(const CUDAStream& other) const noexcept { - return stream_ == other.stream_; + return unwrap() == other.unwrap(); } bool operator!=(const CUDAStream& other) const noexcept { - return stream_ != other.stream_; + return unwrap() != other.unwrap(); } StreamId id() const { return stream_.id(); } @@ -122,6 +123,21 @@ class CUDAStream { operator Stream() const { return unwrap(); } + bool query() const { return unwrap().query(); } + + void synchronize() const { unwrap().synchronize(); } + + int priority() const { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + phi::backends::gpu::GPUDeviceGuard guard(device_index()); + int priority = 0; + C10_CUDA_CHECK(cudaStreamGetPriority(stream(), &priority)); + return priority; +#else + return 0; +#endif + } + cudaStream_t stream() const { return reinterpret_cast(stream_.id()); } @@ -134,17 +150,34 @@ class CUDAStream { Device device() const { return Device(DeviceType::CUDA, device_index()); } - // TODO(youge325): Remove after DeepEP paddle branch is updated to use - // stream() - cudaStream_t raw_stream() const { return stream(); } + struct c10::StreamData3 pack3() const { + return stream_.pack3(); + } + + static CUDAStream unpack3(StreamId stream_id, + DeviceIndex device_index, + DeviceType device_type) { + return CUDAStream(Stream::unpack3(stream_id, device_index, device_type)); + } + + static std::tuple priority_range() { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + int least_priority = 0; + int greatest_priority = 0; + C10_CUDA_CHECK( + cudaDeviceGetStreamPriorityRange(&least_priority, &greatest_priority)); + greatest_priority = + std::max(-max_compile_time_stream_priorities + 1, greatest_priority); + return std::make_tuple(least_priority, greatest_priority); +#else + return std::make_tuple(0, 0); +#endif + } private: Stream stream_; }; -// Build a CUDAStream from a raw platform stream handle and a device index. -// The handle is encoded as a StreamId via reinterpret_cast, matching Paddle's -// phi::Stream / phi::CUDAStream convention. inline CUDAStream make_cuda_stream(cudaStream_t raw, c10::DeviceIndex device_index) { c10::StreamId sid = @@ -155,14 +188,6 @@ inline CUDAStream make_cuda_stream(cudaStream_t raw, sid)); } -/** - * Get the current CUDA stream for the given device (or the current device if - * device_index == -1). - * - * Returns the per-thread current stream if one has been set via - * setCurrentCUDAStream() for this thread and device; otherwise falls back to - * Paddle's default stream for the device. - */ inline CUDAStream getCurrentCUDAStream(c10::DeviceIndex device_index = -1) { if (device_index == -1) { device_index = phi::backends::gpu::GetCurrentDeviceId(); @@ -181,29 +206,21 @@ inline CUDAStream getCurrentCUDAStream(c10::DeviceIndex device_index = -1) { return make_cuda_stream(raw, device_index); } -/** - * Get a stream from the per-device pool in round-robin fashion. - * Returns a high priority stream if isHighPriority is true. - * - * The pool is lazily initialized on first use for each device. Each device - * has kStreamsPerPool low-priority and kStreamsPerPool high-priority streams - * that are reused round-robin. Pool streams are always distinct from the - * current stream, enabling cross-stream dependency management and correct - * record_stream lifetime semantics. - */ inline CUDAStream getStreamFromPool(const bool isHighPriority = false, + c10::DeviceIndex device_index = -1); + +inline CUDAStream getStreamFromPool(const int priority, c10::DeviceIndex device_index = -1) { if (device_index == -1) { device_index = phi::backends::gpu::GetCurrentDeviceId(); } - // get_pool also performs bounds-checking on device_index. auto& state = detail::get_pool(device_index); std::call_once(state.init_flag, [device_index, &state]() { detail::init_pool(device_index, &state); }); cudaStream_t raw; - if (isHighPriority) { + if (priority < 0) { raw = state.high_priority[state.hp_counter.fetch_add(1) % detail::kStreamsPerPool]; } else { @@ -213,14 +230,17 @@ inline CUDAStream getStreamFromPool(const bool isHighPriority = false, return make_cuda_stream(raw, device_index); } -/** - * Set the current CUDA stream for the device of the given stream in the - * calling thread. - * - * Implements per-thread, per-device current stream semantics: the change is - * local to the calling OS thread and does not affect any shared state such as - * Paddle's GPUContext. Other threads continue to see their own current stream. - */ +inline CUDAStream getStreamFromPool(const bool isHighPriority, + c10::DeviceIndex device_index) { + return getStreamFromPool(isHighPriority ? -1 : 0, device_index); +} + +inline CUDAStream getStreamFromExternal(cudaStream_t ext_stream, + c10::DeviceIndex device_index) { + detail::check_device_index(device_index); + return make_cuda_stream(ext_stream, device_index); +} + inline void setCurrentCUDAStream(CUDAStream stream) { c10::DeviceIndex idx = stream.unwrap().device_index(); detail::check_device_index(idx); @@ -234,22 +254,30 @@ inline CUDAStream getDefaultCUDAStream(c10::DeviceIndex device_index = -1) { device_index = phi::backends::gpu::GetCurrentDeviceId(); } detail::check_device_index(device_index); - // The default CUDA stream is always the null stream (cudaStreamDefault, - // handle == 0), regardless of any per-thread current stream override. - // This matches PyTorch semantics where getDefaultCUDAStream() returns the - // fixed device-level default stream, while getCurrentCUDAStream() returns - // the per-thread current stream (which may differ after - // setCurrentCUDAStream). return CUDAStream(c10::Stream( c10::Stream::DEFAULT, c10::Device(c10::DeviceType::CUDA, device_index))); } +inline std::ostream& operator<<(std::ostream& stream, const CUDAStream& s) { + return stream << s.unwrap(); +} + } // namespace c10::cuda +namespace std { +template <> +struct hash { + size_t operator()(c10::cuda::CUDAStream s) const noexcept { + return std::hash{}(s.unwrap()); + } +}; +} // namespace std + namespace at::cuda { using c10::cuda::CUDAStream; using c10::cuda::getCurrentCUDAStream; using c10::cuda::getDefaultCUDAStream; +using c10::cuda::getStreamFromExternal; using c10::cuda::getStreamFromPool; using c10::cuda::setCurrentCUDAStream; } // namespace at::cuda diff --git a/paddle/phi/api/include/compat/c10/util/Exception.h b/paddle/phi/api/include/compat/c10/util/Exception.h index 70c8f656417296..2651c83646044c 100644 --- a/paddle/phi/api/include/compat/c10/util/Exception.h +++ b/paddle/phi/api/include/compat/c10/util/Exception.h @@ -34,16 +34,16 @@ namespace c10 { #define TORCH_CHECK(COND, ...) PD_CHECK(COND, ##__VA_ARGS__); #define TORCH_INTERNAL_ASSERT(COND, ...) PD_CHECK(COND, ##__VA_ARGS__); -#define TORCH_CHECK_OP(val1, val2, op) \ - do { \ - auto&& _val1 = (val1); \ - auto&& _val2 = (val2); \ - if (!(_val1 op _val2)) { \ - std::ostringstream _result; \ - _result << "Expected " #val1 " " #op " " #val2 " (" << _val1 << " " \ - << #op << " " << _val2 << "), but got false"; \ - PD_THROW(_result.str()); \ - } \ +#define TORCH_CHECK_OP(val1, val2, op) \ + do { \ + auto&& _val1 = (val1); \ + auto&& _val2 = (val2); \ + if (!(_val1 op _val2)) { \ + std::ostringstream _result; \ + _result << "Check failed: " #val1 " " #op " " #val2 " (" << _val1 \ + << " vs. " << _val2 << "). "; \ + PD_THROW(_result.str()); \ + } \ } while (false); // Check for a given boolean condition. diff --git a/paddle/phi/api/include/compat/torch/csrc/api/include/torch/cuda.h b/paddle/phi/api/include/compat/torch/csrc/api/include/torch/cuda.h index 3cf18fd4f22574..4eb38ceecc681f 100644 --- a/paddle/phi/api/include/compat/torch/csrc/api/include/torch/cuda.h +++ b/paddle/phi/api/include/compat/torch/csrc/api/include/torch/cuda.h @@ -28,7 +28,5 @@ void synchronize(int64_t device_index = -1); } // namespace torch::cuda namespace at::cuda { -using torch::cuda::device_count; -using torch::cuda::is_available; using torch::cuda::synchronize; } // namespace at::cuda diff --git a/paddle/phi/api/include/compat/torch/csrc/api/include/torch/version.h b/paddle/phi/api/include/compat/torch/csrc/api/include/torch/version.h new file mode 100644 index 00000000000000..f745837944df17 --- /dev/null +++ b/paddle/phi/api/include/compat/torch/csrc/api/include/torch/version.h @@ -0,0 +1,39 @@ +// Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +/// Indicates the major version of LibTorch. +#define TORCH_VERSION_MAJOR 2 + +/// Indicates the minor version of LibTorch. +#define TORCH_VERSION_MINOR 10 + +/// Indicates the patch version of LibTorch. +#define TORCH_VERSION_PATCH 0 + +/// Indicates the ABI version tag of LibTorch. +#define TORCH_VERSION_ABI_TAG 0 + +/// Indicates the version of LibTorch as a string literal. +#define TORCH_VERSION "2.10.0" + +/// Indicates the ABI version of LibTorch as a single uint64. +/// [ byte ][ byte ][ byte ][ byte ][ byte ][ byte ][ byte ][ byte ] +/// [ MAJ ][ MIN ][ PATCH][ ABI TAG ] +#define TORCH_ABI_VERSION \ + (((0ULL + TORCH_VERSION_MAJOR) << 56) | \ + ((0ULL + TORCH_VERSION_MINOR) << 48) | /* NOLINT(whitespace/indent) */ \ + ((0ULL + TORCH_VERSION_PATCH) << 40) | /* NOLINT(whitespace/indent) */ \ + ((0ULL + TORCH_VERSION_ABI_TAG) << 0)) /* NOLINT(whitespace/indent) */ diff --git a/test/cpp/compat/ATen_TensorAccessor_test.cc b/test/cpp/compat/ATen_TensorAccessor_test.cc index cb1eaaf3c8add0..4d6b8e9648bcfa 100644 --- a/test/cpp/compat/ATen_TensorAccessor_test.cc +++ b/test/cpp/compat/ATen_TensorAccessor_test.cc @@ -198,7 +198,7 @@ TEST(TensorAccessorTest, PackedAccessorWithIntType) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) TEST(TensorAccessorTest, PackedAccessorCUDA) { - if (at::cuda::is_available()) { + if (torch::cuda::is_available()) { // Create CUDA tensor at::Tensor tensor = at::arange(12, at::TensorOptions().dtype(at::kFloat).device(at::kCUDA)) diff --git a/test/cpp/compat/ATen_factory_default_dtype_test.cc b/test/cpp/compat/ATen_factory_default_dtype_test.cc index 70dd3ab02cdeeb..08cc8d4c2d9a3d 100644 --- a/test/cpp/compat/ATen_factory_default_dtype_test.cc +++ b/test/cpp/compat/ATen_factory_default_dtype_test.cc @@ -51,19 +51,59 @@ TEST(ATenFactoryDefaultDtypeTest, EmptyNulloptDtypeUsesCurrentDefault) { ASSERT_EQ(tensor.sizes(), c10::IntArrayRef({2, 3})); } -TEST(ATenFactoryDefaultDtypeTest, ArangeNulloptDtypeUsesCurrentDefault) { +TEST(ATenFactoryDefaultDtypeTest, ArangeOmittedDtypeUsesLongForIntegralInputs) { DefaultDtypeGuard guard(at::kDouble); - at::Tensor end_only = + at::Tensor end_only_default = at::arange(5); + at::Tensor start_end_default = at::arange(1, 6); + at::Tensor start_end_step_default = at::arange(1, 7, 2); + at::Tensor end_only_nullopt = at::arange(5, std::nullopt, std::nullopt, at::kCPU, false); - at::Tensor start_end = + at::Tensor start_end_nullopt = at::arange(1, 6, std::nullopt, std::nullopt, at::kCPU, false); - at::Tensor start_end_step = + at::Tensor start_end_step_nullopt = at::arange(1, 7, 2, std::nullopt, std::nullopt, at::kCPU, false); - ASSERT_EQ(end_only.scalar_type(), at::kDouble); - ASSERT_EQ(start_end.scalar_type(), at::kDouble); - ASSERT_EQ(start_end_step.scalar_type(), at::kDouble); + ASSERT_EQ(end_only_default.scalar_type(), at::kLong); + ASSERT_EQ(start_end_default.scalar_type(), at::kLong); + ASSERT_EQ(start_end_step_default.scalar_type(), at::kLong); + ASSERT_EQ(end_only_nullopt.scalar_type(), at::kLong); + ASSERT_EQ(start_end_nullopt.scalar_type(), at::kLong); + ASSERT_EQ(start_end_step_nullopt.scalar_type(), at::kLong); + ASSERT_EQ(end_only_default.data_ptr()[4], 4); + ASSERT_EQ(start_end_default.data_ptr()[0], 1); + ASSERT_EQ(start_end_step_default.data_ptr()[2], 5); + ASSERT_EQ(end_only_nullopt.data_ptr()[4], 4); + ASSERT_EQ(start_end_nullopt.data_ptr()[0], 1); + ASSERT_EQ(start_end_step_nullopt.data_ptr()[2], 5); +} + +TEST(ATenFactoryDefaultDtypeTest, + ArangeOmittedDtypeUsesCurrentDefaultForFloatingInputs) { + DefaultDtypeGuard guard(at::kDouble); + + at::Tensor end_only_default = at::arange(5.0); + at::Tensor start_end_default = at::arange(1.0, 6.0); + at::Tensor start_end_step_default = at::arange(1.0, 7.0, 2.0); + at::Tensor end_only_nullopt = + at::arange(5.0, std::nullopt, std::nullopt, at::kCPU, false); + at::Tensor start_end_nullopt = + at::arange(1.0, 6.0, std::nullopt, std::nullopt, at::kCPU, false); + at::Tensor start_end_step_nullopt = + at::arange(1.0, 7.0, 2.0, std::nullopt, std::nullopt, at::kCPU, false); + + ASSERT_EQ(end_only_default.scalar_type(), at::kDouble); + ASSERT_EQ(start_end_default.scalar_type(), at::kDouble); + ASSERT_EQ(start_end_step_default.scalar_type(), at::kDouble); + ASSERT_EQ(end_only_nullopt.scalar_type(), at::kDouble); + ASSERT_EQ(start_end_nullopt.scalar_type(), at::kDouble); + ASSERT_EQ(start_end_step_nullopt.scalar_type(), at::kDouble); + ASSERT_DOUBLE_EQ(end_only_default.data_ptr()[4], 4.0); + ASSERT_DOUBLE_EQ(start_end_default.data_ptr()[0], 1.0); + ASSERT_DOUBLE_EQ(start_end_step_default.data_ptr()[2], 5.0); + ASSERT_DOUBLE_EQ(end_only_nullopt.data_ptr()[4], 4.0); + ASSERT_DOUBLE_EQ(start_end_nullopt.data_ptr()[0], 1.0); + ASSERT_DOUBLE_EQ(start_end_step_nullopt.data_ptr()[2], 5.0); } TEST(ATenFactoryDefaultDtypeTest, FullNulloptDtypeUsesCurrentDefault) { diff --git a/test/cpp/compat/ATen_pin_memory_creation_test.cc b/test/cpp/compat/ATen_pin_memory_creation_test.cc index 4ead12a484f79f..8d89d00b07b083 100644 --- a/test/cpp/compat/ATen_pin_memory_creation_test.cc +++ b/test/cpp/compat/ATen_pin_memory_creation_test.cc @@ -123,8 +123,16 @@ TEST(ATenPinMemoryCreationTest, EyePinMemoryWithCUDADeviceErrors) { std::exception); } -TEST(ATenPinMemoryCreationTest, ArangePinMemory) { +TEST(ATenPinMemoryCreationTest, ArangePinMemoryOverloads) { SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); + auto end_only_by_options = + at::arange(10, at::TensorOptions().dtype(at::kFloat).pinned_memory(true)); + AssertPinned(end_only_by_options); + + auto end_only_by_args = + at::arange(10, at::kFloat, std::nullopt, at::kCPU, true); + AssertPinned(end_only_by_args); + auto by_options = at::arange( 0, 10, at::TensorOptions().dtype(at::kFloat).pinned_memory(true)); AssertPinned(by_options); @@ -132,14 +140,30 @@ TEST(ATenPinMemoryCreationTest, ArangePinMemory) { auto by_args = at::arange(0, 10, at::kFloat, std::nullopt, at::kCPU, true); AssertPinned(by_args); + auto step_by_options = at::arange( + 0, 10, 2, at::TensorOptions().dtype(at::kFloat).pinned_memory(true)); + AssertPinned(step_by_options); + + auto step_by_args = + at::arange(0, 10, 2, at::kFloat, std::nullopt, at::kCPU, true); + AssertPinned(step_by_args); + auto no_pin = at::arange(0, 10, at::kFloat, std::nullopt, at::kCPU, false); AssertNotPinned(no_pin); + + auto step_no_pin = + at::arange(0, 10, 2, at::kFloat, std::nullopt, at::kCPU, false); + AssertNotPinned(step_no_pin); } TEST(ATenPinMemoryCreationTest, ArangePinMemoryWithCUDADeviceErrors) { SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); + ASSERT_THROW(at::arange(10, at::kFloat, std::nullopt, at::kCUDA, true), + std::exception); ASSERT_THROW(at::arange(0, 10, at::kFloat, std::nullopt, at::kCUDA, true), std::exception); + ASSERT_THROW(at::arange(0, 10, 2, at::kFloat, std::nullopt, at::kCUDA, true), + std::exception); } TEST(ATenPinMemoryCreationTest, EmptyLikePinMemory) { diff --git a/test/cpp/compat/ATen_resize_test.cc b/test/cpp/compat/ATen_resize_test.cc index 0ad6b09f58eb2b..e39fc40c58a176 100644 --- a/test/cpp/compat/ATen_resize_test.cc +++ b/test/cpp/compat/ATen_resize_test.cc @@ -24,8 +24,8 @@ #include "torch/all.h" // ======================== resize_ tests ======================== -// Note: Paddle's resize_ is implemented via reshape, which requires -// total element count to remain unchanged. +// Note: compat resize_ uses reshape when numel is unchanged, and falls back to +// set_ for storage-changing cases so repeated resize_ calls remain stable. TEST(TensorResizeTest, ResizeBasic) { // Create a 2x3 tensor @@ -109,6 +109,34 @@ TEST(TensorResizeTest, ResizePreservesData) { ASSERT_FLOAT_EQ(data[5], 5.0f); } +TEST(TensorResizeTest, ResizeShrinkDifferentNumel) { + at::Tensor t = at::arange(24, at::kFloat).reshape({2, 3, 4}); + + t.resize_({4, 5}); + + ASSERT_EQ(t.sizes()[0], 4); + ASSERT_EQ(t.sizes()[1], 5); + + float* data = t.data_ptr(); + for (int i = 0; i < 20; ++i) { + ASSERT_FLOAT_EQ(data[i], static_cast(i)); + } +} + +TEST(TensorResizeTest, ResizeGrowDifferentNumelPreservesPrefix) { + at::Tensor t = at::arange(6, at::kFloat).reshape({2, 3}); + + t.resize_({2, 5}); + + ASSERT_EQ(t.sizes()[0], 2); + ASSERT_EQ(t.sizes()[1], 5); + + float* data = t.data_ptr(); + for (int i = 0; i < 6; ++i) { + ASSERT_FLOAT_EQ(data[i], static_cast(i)); + } +} + TEST(TensorResizeTest, ResizeReturnReference) { // Create a tensor at::Tensor t = at::zeros({2, 3}); diff --git a/test/cpp/compat/c10_Device_test.cc b/test/cpp/compat/c10_Device_test.cc index 08331a68162194..04dc88b4d0e2e6 100644 --- a/test/cpp/compat/c10_Device_test.cc +++ b/test/cpp/compat/c10_Device_test.cc @@ -16,6 +16,7 @@ #include #include +#include #include "gtest/gtest.h" @@ -68,6 +69,9 @@ TEST(DeviceTypeCompatTest, DeviceTypeConversionAndStreamOperator) { std::ostringstream invalid_os; invalid_os << static_cast(99); EXPECT_TRUE(invalid_os.str().empty()); + + EXPECT_EQ(c10::DeviceType::PrivateUse1, c10::DeviceType::CUSTOM); + EXPECT_EQ(c10::kPrivateUse1, c10::DeviceType::PrivateUse1); } TEST(DeviceCompatTest, DeviceParseAndPlaceBranches) { @@ -122,3 +126,47 @@ TEST(DeviceCompatTest, DeviceParseAndPlaceBranches) { os << cuda; EXPECT_EQ(os.str(), "cuda:3"); } + +TEST(DeviceCompatTest, DeviceInterfaceParity) { + c10::Device cpu(c10::kCPU); + c10::Device cuda(c10::kCUDA, 0); + c10::Device xpu(c10::kXPU, 1); + c10::Device ipu(c10::kIPU, 2); + c10::Device privateuse(c10::kPrivateUse1, 4); + + EXPECT_TRUE(cpu.is_cpu()); + EXPECT_TRUE(cuda.is_cuda()); + EXPECT_TRUE(xpu.is_xpu()); + EXPECT_TRUE(ipu.is_ipu()); + EXPECT_TRUE(privateuse.is_privateuseone()); + EXPECT_FALSE(privateuse.is_mps()); + EXPECT_FALSE(privateuse.is_hip()); + EXPECT_FALSE(privateuse.is_meta()); + EXPECT_TRUE(cpu.supports_as_strided()); + EXPECT_FALSE(ipu.supports_as_strided()); + + c10::Device cpu_with_index(c10::kCPU); + cpu_with_index.set_index(0); + EXPECT_EQ(cpu_with_index.index(), 0); + EXPECT_EQ(cpu_with_index.str(), "cpu:0"); + + c10::Device cuda_with_index(c10::kCUDA); + cuda_with_index.set_index(2); + EXPECT_EQ(cuda_with_index.index(), 2); + EXPECT_EQ(cuda_with_index.str(), "cuda:2"); + + EXPECT_NE(cpu, cuda); + EXPECT_EQ(cuda, c10::Device(c10::kCUDA, 0)); + EXPECT_EQ(privateuse.str(), "privateuseone:4"); + EXPECT_TRUE(c10::Device("privateuseone:7").is_privateuseone()); + + EXPECT_THROW(c10::Device("cuda:-1"), ::std::exception); + EXPECT_THROW(c10::Device("cuda:01"), ::std::exception); + EXPECT_THROW(c10::Device("cuda:1:2"), ::std::exception); + + std::unordered_map device_map; + device_map.emplace(c10::Device(c10::kCUDA, 0), 7); + device_map.emplace(c10::Device(c10::kCPU), 3); + EXPECT_EQ(device_map.at(c10::Device(c10::kCUDA, 0)), 7); + EXPECT_EQ(device_map.at(c10::Device(c10::kCPU)), 3); +} diff --git a/test/cpp/compat/c10_ScalarType_test.cc b/test/cpp/compat/c10_ScalarType_test.cc index ea0895412f72df..a373ea81841047 100644 --- a/test/cpp/compat/c10_ScalarType_test.cc +++ b/test/cpp/compat/c10_ScalarType_test.cc @@ -25,7 +25,6 @@ #include #include #endif -#include #include "ATen/ATen.h" #include "gtest/gtest.h" #include "paddle/phi/common/float16.h" @@ -91,78 +90,3 @@ TEST(TensorBaseTest, TypeCheckingAPIs) { ASSERT_FALSE(uint8_tensor.is_signed()); ASSERT_FALSE(bool_tensor.is_signed()); } - -TEST(ScalarTypeCompatTest, ScalarTypeUtilityBranches) { - EXPECT_STREQ(c10::toString(c10::ScalarType::Bits1x8), "Bits1x8"); - EXPECT_STREQ(c10::toString(c10::ScalarType::Bits16), "Bits16"); - EXPECT_STREQ(c10::toString(c10::ScalarType::Float8_e5m2fnuz), - "Float8_e5m2fnuz"); - EXPECT_STREQ(c10::toString(c10::ScalarType::Float8_e4m3fnuz), - "Float8_e4m3fnuz"); - EXPECT_STREQ(c10::toString(c10::ScalarType::Float8_e8m0fnu), - "Float8_e8m0fnu"); - EXPECT_STREQ(c10::toString(c10::ScalarType::Float4_e2m1fn_x2), - "Float4_e2m1fn_x2"); - EXPECT_STREQ(c10::toString(c10::ScalarType::Undefined), "Undefined"); - EXPECT_STREQ(c10::toString(static_cast(-1)), - "UNKNOWN_SCALAR"); - - EXPECT_EQ(c10::elementSize(c10::ScalarType::QInt8), static_cast(1)); - EXPECT_EQ(c10::elementSize(c10::ScalarType::QUInt4x2), - static_cast(1)); - EXPECT_EQ(c10::elementSize(c10::ScalarType::QInt32), static_cast(4)); - EXPECT_EQ(c10::elementSize(c10::ScalarType::Bits16), static_cast(4)); - EXPECT_THROW(c10::elementSize(c10::ScalarType::Undefined), ::std::exception); - - EXPECT_TRUE(c10::isIntegralType(c10::ScalarType::Bool, true)); - EXPECT_FALSE(c10::isIntegralType(c10::ScalarType::Bool, false)); - EXPECT_TRUE(c10::isFloat8Type(c10::ScalarType::Float8_e5m2)); - EXPECT_FALSE(c10::isFloat8Type(c10::ScalarType::Float8_e4m3fnuz)); - EXPECT_TRUE(c10::isReducedFloatingType(c10::ScalarType::BFloat16)); - EXPECT_TRUE(c10::isFloatingType(c10::ScalarType::Float)); - EXPECT_FALSE(c10::isComplexType(c10::ScalarType::ComplexHalf)); - - EXPECT_TRUE(c10::isSignedType(c10::ScalarType::Int1)); - EXPECT_FALSE(c10::isSignedType(c10::ScalarType::UInt3)); - EXPECT_FALSE(c10::isSignedType(c10::ScalarType::QUInt8)); - EXPECT_TRUE(c10::isSignedType(c10::ScalarType::Float8_e5m2fnuz)); - EXPECT_THROW(c10::isSignedType(c10::ScalarType::Undefined), ::std::exception); - - std::ostringstream oss; - oss << c10::ScalarType::UInt7; - EXPECT_EQ(oss.str(), "UInt7"); -} - -TEST(ScalarTypeCompatTest, AdditionalEnumAndPredicateBranches) { - EXPECT_STREQ(c10::toString(c10::ScalarType::QInt8), "QInt8"); - EXPECT_STREQ(c10::toString(c10::ScalarType::QUInt8), "QUInt8"); - EXPECT_STREQ(c10::toString(c10::ScalarType::QInt32), "QInt32"); - EXPECT_STREQ(c10::toString(c10::ScalarType::QUInt4x2), "QUInt4x2"); - EXPECT_STREQ(c10::toString(c10::ScalarType::QUInt2x4), "QUInt2x4"); - EXPECT_STREQ(c10::toString(c10::ScalarType::ComplexHalf), "ComplexHalf"); - EXPECT_STREQ(c10::toString(c10::ScalarType::Bits2x4), "Bits2x4"); - EXPECT_STREQ(c10::toString(c10::ScalarType::Bits4x2), "Bits4x2"); - EXPECT_STREQ(c10::toString(c10::ScalarType::Bits8), "Bits8"); - - EXPECT_EQ(c10::elementSize(c10::ScalarType::QUInt8), static_cast(1)); - EXPECT_EQ(c10::elementSize(c10::ScalarType::QUInt2x4), - static_cast(1)); - EXPECT_EQ(c10::elementSize(c10::ScalarType::Bits2x4), static_cast(1)); - EXPECT_EQ(c10::elementSize(c10::ScalarType::Bits4x2), static_cast(1)); - EXPECT_EQ(c10::elementSize(c10::ScalarType::Bits8), static_cast(1)); - - EXPECT_TRUE(c10::isIntegralType(c10::ScalarType::UInt64, false)); - EXPECT_FALSE(c10::isIntegralType(c10::ScalarType::Float, true)); - EXPECT_TRUE(c10::isFloat8Type(c10::ScalarType::Float8_e4m3fn)); - EXPECT_TRUE(c10::isReducedFloatingType(c10::ScalarType::Half)); - EXPECT_FALSE(c10::isReducedFloatingType(c10::ScalarType::Float)); - EXPECT_TRUE(c10::isFloatingType(c10::ScalarType::Half)); - EXPECT_TRUE(c10::isComplexType(c10::ScalarType::ComplexFloat)); - - EXPECT_TRUE(c10::isSignedType(c10::ScalarType::QInt8)); - EXPECT_TRUE(c10::isSignedType(c10::ScalarType::ComplexHalf)); - EXPECT_FALSE(c10::isSignedType(c10::ScalarType::Byte)); - EXPECT_FALSE(c10::isSignedType(c10::ScalarType::Bool)); - EXPECT_THROW(c10::isSignedType(c10::ScalarType::NumOptions), - ::std::exception); -} diff --git a/test/cpp/compat/c10_layout_test.cc b/test/cpp/compat/c10_layout_test.cc index 6e163aa8ee4c5a..1a0babeb7d3673 100644 --- a/test/cpp/compat/c10_layout_test.cc +++ b/test/cpp/compat/c10_layout_test.cc @@ -368,6 +368,9 @@ TEST(SparseConstructorTest, SparseCooTensorInferSize) { ASSERT_TRUE(sparse.is_sparse()); ASSERT_EQ(sparse.layout(), c10::kSparse); + ASSERT_EQ(sparse.dim(), 2); + ASSERT_EQ(sparse.size(0), 3); + ASSERT_EQ(sparse.size(1), 3); } TEST(SparseConstructorTest, SparseCooTensorDouble) { diff --git a/test/cpp/compat/c10_storage_test.cc b/test/cpp/compat/c10_storage_test.cc index f05083fe88747f..947786fb929870 100644 --- a/test/cpp/compat/c10_storage_test.cc +++ b/test/cpp/compat/c10_storage_test.cc @@ -26,8 +26,9 @@ #include "paddle/phi/backends/gpu/gpu_info.h" // Forward-declare getCUDADeviceAllocator to avoid include-order conflicts -// between ATen/cuda/CUDAContextLight.h (defines at::cuda::is_available inline) -// and torch/cuda.h (adds `using torch::cuda::is_available` to at::cuda). +// between ATen/cuda/CUDAContextLight.h (defines torch::cuda::is_available +// inline) and torch/cuda.h (adds `using torch::cuda::is_available` to +// at::cuda). namespace at::cuda { c10::Allocator* getCUDADeviceAllocator(); } // namespace at::cuda @@ -242,7 +243,7 @@ TEST(StorageTest, DeviceAndDeviceTypeAPIs) { ASSERT_EQ(place.GetType(), phi::AllocationType::CPU); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (at::cuda::is_available()) { + if (torch::cuda::is_available()) { at::TensorBase cuda_tensor = at::ones( {2, 3}, c10::TensorOptions().dtype(at::kFloat).device(at::kCUDA)); const c10::Storage& cuda_storage = cuda_tensor.storage(); @@ -731,9 +732,13 @@ TEST(StorageTest, DataPtrHelpersAndAllocatorSimpleDataPtrChecks) { dp.unsafe_set_device(c10::Device(c10::DeviceType::CPU)); ASSERT_EQ(dp.device().type(), c10::DeviceType::CPU); - // is_simple_data_ptr: context==nullptr branch. + // PyTorch only treats context==data as a simple DataPtr. RawCompatibleAllocator compatible_alloc; - ASSERT_TRUE(compatible_alloc.is_simple_data_ptr(dp)); + ASSERT_FALSE(compatible_alloc.is_simple_data_ptr(dp)); + + // is_simple_data_ptr: context==data branch. + c10::DataPtr simple = compatible_alloc.allocate(4); + ASSERT_TRUE(compatible_alloc.is_simple_data_ptr(simple)); // is_simple_data_ptr: context!=data branch. c10::DataPtr non_simple = RawIncompatibleAllocator().allocate(4); @@ -1041,7 +1046,7 @@ TEST(StorageTest, ReferenceSemanticsSetNbytesVisibleThroughCopy) { TEST(StorageTest, CUDAAllocatorZeroBytePreservesDevice) { // getCUDADeviceAllocator()->allocate(0) must return a DataPtr whose device // is the current CUDA device, not a default-constructed CPU DataPtr. - if (!at::cuda::is_available()) { + if (!torch::cuda::is_available()) { return; // No CUDA device, skip } diff --git a/test/cpp/compat/compat_basic_test.cc b/test/cpp/compat/compat_basic_test.cc index d6f33869449003..232a9fd66e8f7c 100644 --- a/test/cpp/compat/compat_basic_test.cc +++ b/test/cpp/compat/compat_basic_test.cc @@ -231,7 +231,7 @@ TEST(compat_basic_test, BasicCase) { TEST(TestDevice, DeviceAPIsOnCUDA) { // Test device related APIs on CUDA if available #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (at::cuda::is_available()) { + if (torch::cuda::is_available()) { at::TensorBase cuda_tensor = at::ones( {2, 3}, c10::TensorOptions().dtype(at::kFloat).device(at::kCUDA));