diff --git a/CHANGELOG.md b/CHANGELOG.md index 9ed9491d0ecc..01f627a789dc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -34,6 +34,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 * Replaced `ci` section in `.pre-commit-config.yaml` with a new GitHub workflow with scheduled run to autoupdate the `pre-commit` configuration [#2542](https://github.com/IntelPython/dpnp/pull/2542) * FFT module is updated to perform in-place FFT in intermediate steps of ND FFT [#2543](https://github.com/IntelPython/dpnp/pull/2543) * Reused dpctl tensor include to enable experimental SYCL namespace for complex types [#2546](https://github.com/IntelPython/dpnp/pull/2546) +* Improved performance of `dpnp.isclose` function by implementing a dedicated kernel for scalar `rtol` and `atol` arguments [#2540](https://github.com/IntelPython/dpnp/pull/2540) ### Deprecated diff --git a/dpnp/backend/extensions/common/ext/details/validation_utils_internal.hpp b/dpnp/backend/extensions/common/ext/details/validation_utils_internal.hpp index 2ff800aced59..816ea0453ade 100644 --- a/dpnp/backend/extensions/common/ext/details/validation_utils_internal.hpp +++ b/dpnp/backend/extensions/common/ext/details/validation_utils_internal.hpp @@ -114,8 +114,10 @@ inline void check_no_overlap(const array_ptr &input, } const auto &overlap = dpctl::tensor::overlap::MemoryOverlap(); + const auto &same_logical_tensors = + dpctl::tensor::overlap::SameLogicalTensors(); - if (overlap(*input, *output)) { + if (overlap(*input, *output) && !same_logical_tensors(*input, *output)) { throw py::value_error(name_of(input, names) + " has overlapping memory segments with " + name_of(output, names)); diff --git a/dpnp/backend/extensions/ufunc/CMakeLists.txt b/dpnp/backend/extensions/ufunc/CMakeLists.txt index 65d30d49f549..dfd58236ad69 100644 --- a/dpnp/backend/extensions/ufunc/CMakeLists.txt +++ b/dpnp/backend/extensions/ufunc/CMakeLists.txt @@ -37,6 +37,7 @@ set(_elementwise_sources ${CMAKE_CURRENT_SOURCE_DIR}/elementwise_functions/heaviside.cpp ${CMAKE_CURRENT_SOURCE_DIR}/elementwise_functions/i0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/elementwise_functions/interpolate.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/elementwise_functions/isclose.cpp ${CMAKE_CURRENT_SOURCE_DIR}/elementwise_functions/lcm.cpp ${CMAKE_CURRENT_SOURCE_DIR}/elementwise_functions/ldexp.cpp ${CMAKE_CURRENT_SOURCE_DIR}/elementwise_functions/logaddexp2.cpp diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/common.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/common.cpp index c6dd3e038eb1..9d30f16273f0 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/common.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/common.cpp @@ -37,6 +37,7 @@ #include "heaviside.hpp" #include "i0.hpp" #include "interpolate.hpp" +#include "isclose.hpp" #include "lcm.hpp" #include "ldexp.hpp" #include "logaddexp2.hpp" @@ -66,6 +67,7 @@ void init_elementwise_functions(py::module_ m) init_heaviside(m); init_i0(m); init_interpolate(m); + init_isclose(m); init_lcm(m); init_ldexp(m); init_logaddexp2(m); diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/isclose.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/isclose.cpp new file mode 100644 index 000000000000..bd6053c9b7ab --- /dev/null +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/isclose.cpp @@ -0,0 +1,408 @@ +//***************************************************************************** +// Copyright (c) 2025, Intel Corporation +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** + +#include +#include +#include +#include +#include + +#include + +#include "dpctl4pybind11.hpp" +#include +#include + +#include "kernels/elementwise_functions/isclose.hpp" + +#include "../../elementwise_functions/simplify_iteration_space.hpp" + +// dpctl tensor headers +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" + +#include "ext/common.hpp" +#include "ext/validation_utils.hpp" + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +using ext::common::value_type_of_t; +using ext::validation::array_names; + +using ext::common::dtype_from_typenum; +using ext::validation::check_has_dtype; +using ext::validation::check_no_overlap; +using ext::validation::check_num_dims; +using ext::validation::check_queue; +using ext::validation::check_same_dtype; +using ext::validation::check_same_size; +using ext::validation::check_writable; + +namespace dpnp::extensions::ufunc +{ + +namespace impl +{ + +typedef sycl::event (*isclose_strided_scalar_fn_ptr_t)( + sycl::queue &, + const int, // nd + const std::size_t, // nelems + const py::ssize_t *, // shape_strides + const py::object &, // rtol + const py::object &, // atol + const py::object &, // equal_nan + const char *, // a + py::ssize_t, // a_offset + const char *, // b + py::ssize_t, // b_offset + char *, // out + py::ssize_t, // out_offset + const std::vector &); + +template +sycl::event isclose_strided_scalar_call(sycl::queue &exec_q, + const int nd, + const std::size_t nelems, + const py::ssize_t *shape_strides, + const py::object &py_rtol, + const py::object &py_atol, + const py::object &py_equal_nan, + const char *in1_p, + py::ssize_t in1_offset, + const char *in2_p, + py::ssize_t in2_offset, + char *out_p, + py::ssize_t out_offset, + const std::vector &depends) +{ + using dpctl::tensor::type_utils::is_complex_v; + using scT = std::conditional_t, value_type_of_t, T>; + + const scT rtol = py::cast(py_rtol); + const scT atol = py::cast(py_atol); + const bool equal_nan = py::cast(py_equal_nan); + + return dpnp::kernels::isclose::isclose_strided_scalar_impl( + 
exec_q, nd, nelems, shape_strides, rtol, atol, equal_nan, in1_p, + in1_offset, in2_p, in2_offset, out_p, out_offset, depends); +} + +typedef sycl::event (*isclose_contig_scalar_fn_ptr_t)( + sycl::queue &, + const std::size_t, // nelems + const py::object &, // rtol + const py::object &, // atol + const py::object &, // equal_nan + const char *, // a + const char *, // b + char *, // out + const std::vector &); + +template +sycl::event isclose_contig_scalar_call(sycl::queue &q, + const std::size_t nelems, + const py::object &py_rtol, + const py::object &py_atol, + const py::object &py_equal_nan, + const char *in1_p, + const char *in2_p, + char *out_p, + const std::vector &depends) +{ + using dpctl::tensor::type_utils::is_complex_v; + using scT = std::conditional_t, value_type_of_t, T>; + + const scT rtol = py::cast(py_rtol); + const scT atol = py::cast(py_atol); + const bool equal_nan = py::cast(py_equal_nan); + + return dpnp::kernels::isclose::isclose_contig_scalar_impl( + q, nelems, rtol, atol, equal_nan, in1_p, in2_p, out_p, depends); +} + +isclose_strided_scalar_fn_ptr_t + isclose_strided_scalar_dispatch_vector[td_ns::num_types]; +isclose_contig_scalar_fn_ptr_t isclose_contig_dispatch_vector[td_ns::num_types]; + +std::pair + py_isclose_scalar(const dpctl::tensor::usm_ndarray &a, + const dpctl::tensor::usm_ndarray &b, + const py::object &py_rtol, + const py::object &py_atol, + const py::object &py_equal_nan, + const dpctl::tensor::usm_ndarray &res, + sycl::queue &exec_q, + const std::vector &depends) +{ + array_names names = {{&a, "a"}, {&b, "b"}, {&res, "res"}}; + + check_same_dtype(&a, &b, names); + check_has_dtype(&res, td_ns::typenum_t::BOOL, names); + + check_same_size({&a, &b, &res}, names); + int res_nd = res.get_ndim(); + check_num_dims({&a, &b}, res_nd, names); + + check_queue({&a, &b, &res}, names, exec_q); + check_no_overlap({&a, &b}, {&res}, names); + check_writable({&res}, names); + + auto types = td_ns::usm_ndarray_types(); + // a_typeid == b_typeid (check_same_dtype(&a, &b, names)) + int a_b_typeid = types.typenum_to_lookup_id(a.get_typenum()); + + const py::ssize_t *a_shape = a.get_shape_raw(); + const py::ssize_t *b_shape = b.get_shape_raw(); + const py::ssize_t *res_shape = res.get_shape_raw(); + bool shapes_equal(true); + std::size_t nelems(1); + + for (int i = 0; i < res_nd; ++i) { + nelems *= static_cast(a_shape[i]); + shapes_equal = shapes_equal && (a_shape[i] == res_shape[i] && + b_shape[i] == res_shape[i]); + } + if (!shapes_equal) { + throw py::value_error("Array shapes are not the same"); + } + + // if nelems is zero, return + if (nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(res, nelems); + + const char *a_data = a.get_data(); + const char *b_data = b.get_data(); + char *res_data = res.get_data(); + + // handle contiguous inputs + const bool is_a_c_contig = a.is_c_contiguous(); + const bool is_a_f_contig = a.is_f_contiguous(); + + const bool is_b_c_contig = b.is_c_contiguous(); + const bool is_b_f_contig = b.is_f_contiguous(); + + const bool is_res_c_contig = res.is_c_contiguous(); + const bool is_res_f_contig = res.is_f_contiguous(); + + const bool all_c_contig = + (is_a_c_contig && is_b_c_contig && is_res_c_contig); + const bool all_f_contig = + (is_a_f_contig && is_b_f_contig && is_res_f_contig); + + if (all_c_contig || all_f_contig) { + auto contig_fn = isclose_contig_dispatch_vector[a_b_typeid]; + + if (contig_fn == nullptr) { + py::dtype a_b_dtype_py = 
dtype_from_typenum(a_b_typeid); + throw std::runtime_error( + "Contiguous implementation is missing for " + + std::string(py::str(a_b_dtype_py)) + "data type"); + } + + auto comp_ev = contig_fn(exec_q, nelems, py_rtol, py_atol, py_equal_nan, + a_data, b_data, res_data, depends); + sycl::event ht_ev = + dpctl::utils::keep_args_alive(exec_q, {a, b, res}, {comp_ev}); + + return std::make_pair(ht_ev, comp_ev); + } + + // simplify iteration space + // if 1d with strides 1 - input is contig + // dispatch to strided + + auto const &a_strides = a.get_strides_vector(); + auto const &b_strides = b.get_strides_vector(); + auto const &res_strides = res.get_strides_vector(); + + using shT = std::vector; + shT simplified_shape; + shT simplified_a_strides; + shT simplified_b_strides; + shT simplified_res_strides; + py::ssize_t a_offset(0); + py::ssize_t b_offset(0); + py::ssize_t res_offset(0); + + int nd = res_nd; + const py::ssize_t *shape = a_shape; + + py_internal::simplify_iteration_space_3( + nd, shape, a_strides, b_strides, res_strides, + // output + simplified_shape, simplified_a_strides, simplified_b_strides, + simplified_res_strides, a_offset, b_offset, res_offset); + + if (nd == 1 && simplified_a_strides[0] == 1 && + simplified_b_strides[0] == 1 && simplified_res_strides[0] == 1) + { + // Special case of contiguous data + auto contig_fn = isclose_contig_dispatch_vector[a_b_typeid]; + + if (contig_fn == nullptr) { + py::dtype a_b_dtype_py = dtype_from_typenum(a_b_typeid); + throw std::runtime_error( + "Contiguous implementation is missing for " + + std::string(py::str(a_b_dtype_py)) + "data type"); + } + + int a_elem_size = a.get_elemsize(); + int b_elem_size = b.get_elemsize(); + int res_elem_size = res.get_elemsize(); + auto comp_ev = contig_fn( + exec_q, nelems, py_rtol, py_atol, py_equal_nan, + a_data + a_elem_size * a_offset, b_data + b_elem_size * b_offset, + res_data + res_elem_size * res_offset, depends); + + sycl::event ht_ev = + dpctl::utils::keep_args_alive(exec_q, {a, b, res}, {comp_ev}); + + return std::make_pair(ht_ev, comp_ev); + } + + auto strided_fn = isclose_strided_scalar_dispatch_vector[a_b_typeid]; + + if (strided_fn == nullptr) { + py::dtype a_b_dtype_py = dtype_from_typenum(a_b_typeid); + throw std::runtime_error("Strided implementation is missing for " + + std::string(py::str(a_b_dtype_py)) + + "data type"); + } + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + + std::vector host_tasks{}; + host_tasks.reserve(2); + + auto ptr_size_event_triple_ = device_allocate_and_pack( + exec_q, host_tasks, simplified_shape, simplified_a_strides, + simplified_b_strides, simplified_res_strides); + auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_triple_)); + const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_triple_); + const py::ssize_t *shape_strides = shape_strides_owner.get(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shape_ev); + + sycl::event comp_ev = strided_fn( + exec_q, nd, nelems, shape_strides, py_rtol, py_atol, py_equal_nan, + a_data, a_offset, b_data, b_offset, res_data, res_offset, all_deps); + + // async free of shape_strides temporary + sycl::event tmp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {comp_ev}, shape_strides_owner); + + host_tasks.push_back(tmp_cleanup_ev); + + return std::make_pair( + dpctl::utils::keep_args_alive(exec_q, {a, b, res}, host_tasks), + comp_ev); +} + +/** + * 
@brief A factory to define pairs of supported types for which + * isclose function is available. + * + * @tparam T Type of input vector `a` and `b` and of result vector `y`. + */ +template +struct IsCloseOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry>, + td_ns::TypeMapResultEntry>, + td_ns::DefaultResultEntry>::result_type; +}; + +template +struct IsCloseStridedScalarFactory +{ + fnT get() + { + if constexpr (std::is_same_v::value_type, + void>) { + return nullptr; + } + else { + return isclose_strided_scalar_call; + } + } +}; + +template +struct IsCloseContigScalarFactory +{ + fnT get() + { + if constexpr (std::is_same_v::value_type, + void>) { + return nullptr; + } + else { + return isclose_contig_scalar_call; + } + } +}; + +void populate_isclose_dispatch_vectors() +{ + using namespace td_ns; + + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(isclose_strided_scalar_dispatch_vector); + + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(isclose_contig_dispatch_vector); +} + +} // namespace impl + +void init_isclose(py::module_ m) +{ + impl::populate_isclose_dispatch_vectors(); + + m.def("_isclose_scalar", &impl::py_isclose_scalar, "", py::arg("a"), + py::arg("b"), py::arg("py_rtol"), py::arg("py_atol"), + py::arg("py_equal_nan"), py::arg("res"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); +} + +} // namespace dpnp::extensions::ufunc diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/isclose.hpp b/dpnp/backend/extensions/ufunc/elementwise_functions/isclose.hpp new file mode 100644 index 000000000000..5216db4f3a29 --- /dev/null +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/isclose.hpp @@ -0,0 +1,35 @@ +//***************************************************************************** +// Copyright (c) 2025, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** + +#pragma once + +#include + +namespace py = pybind11; + +namespace dpnp::extensions::ufunc +{ +void init_isclose(py::module_ m); +} // namespace dpnp::extensions::ufunc diff --git a/dpnp/backend/kernels/elementwise_functions/isclose.hpp b/dpnp/backend/kernels/elementwise_functions/isclose.hpp new file mode 100644 index 000000000000..b2a0db782f5f --- /dev/null +++ b/dpnp/backend/kernels/elementwise_functions/isclose.hpp @@ -0,0 +1,337 @@ +//***************************************************************************** +// Copyright (c) 2025, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** + +#pragma once + +#include +#include +#include +#include + +#include +// dpctl tensor headers +#include "kernels/alignment.hpp" +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/sycl_complex.hpp" +#include "utils/offset_utils.hpp" +#include "utils/sycl_utils.hpp" +#include "utils/type_utils.hpp" + +namespace dpnp::kernels::isclose +{ + +template +inline bool isclose(const T a, + const T b, + const T rtol, + const T atol, + const bool equal_nan) +{ + static_assert(std::is_floating_point_v || std::is_same_v); + + if (sycl::isfinite(a) && sycl::isfinite(b)) { + return sycl::fabs(a - b) <= atol + rtol * sycl::fabs(b); + } + + if (sycl::isnan(a) && sycl::isnan(b)) { + return equal_nan; + } + + return a == b; +} + +template +inline bool isclose(const std::complex a, + const std::complex b, + const T rtol, + const T atol, + const bool equal_nan) +{ + const bool a_finite = sycl::isfinite(a.real()) && sycl::isfinite(a.imag()); + const bool b_finite = sycl::isfinite(b.real()) && sycl::isfinite(b.imag()); + + if (a_finite && b_finite) { + return exprm_ns::abs(exprm_ns::complex(a - b)) <= + atol + rtol * exprm_ns::abs(exprm_ns::complex(b)); + } + + if (sycl::isnan(a.real()) && sycl::isnan(a.imag()) && + sycl::isnan(b.real()) && sycl::isnan(b.imag())) + { + return equal_nan; + } + + return a == b; +} + +template +struct IsCloseStridedScalarFunctor +{ +private: + const T *a_ = nullptr; + const T *b_ = nullptr; + resTy *out_ = nullptr; + const ThreeOffsets_IndexerT three_offsets_indexer_; + const scT rtol_; + const scT atol_; + const bool equal_nan_; + +public: + IsCloseStridedScalarFunctor(const T *a, + const T *b, + resTy *out, + const ThreeOffsets_IndexerT &inps_res_indexer, + const scT rtol, + const scT atol, + const bool equal_nan) + : a_(a), b_(b), out_(out), three_offsets_indexer_(inps_res_indexer), + rtol_(rtol), atol_(atol), equal_nan_(equal_nan) + { + } + + void operator()(sycl::id<1> wid) const + { + const auto &three_offsets_ = three_offsets_indexer_(wid.get(0)); + const dpctl::tensor::ssize_t &inp1_offset = + three_offsets_.get_first_offset(); + const dpctl::tensor::ssize_t &inp2_offset = + three_offsets_.get_second_offset(); + const dpctl::tensor::ssize_t &out_offset = + three_offsets_.get_third_offset(); + + out_[out_offset] = + isclose(a_[inp1_offset], b_[inp2_offset], rtol_, atol_, equal_nan_); + } +}; + +template +struct IsCloseContigScalarFunctor +{ +private: + const T *a_ = nullptr; + const T *b_ = nullptr; + resTy *out_ = nullptr; + std::size_t nelems_; + const scT rtol_; + const scT atol_; + const bool equal_nan_; + +public: + IsCloseContigScalarFunctor(const T *a, + const T *b, + resTy *out, + const std::size_t n_elems, + const scT rtol, + const scT atol, + const bool equal_nan) + : a_(a), b_(b), out_(out), nelems_(n_elems), rtol_(rtol), atol_(atol), + equal_nan_(equal_nan) + { + } + + void operator()(sycl::nd_item<1> ndit) const + { + constexpr std::uint8_t elems_per_wi = n_vecs * vec_sz; + /* Each work-item processes vec_sz elements, contiguous in memory */ + /* NOTE: work-group size must be divisible by sub-group size */ + + using dpctl::tensor::type_utils::is_complex_v; + if constexpr (enable_sg_loadstore && !is_complex_v) { + auto sg = ndit.get_sub_group(); + const std::uint16_t sgSize = sg.get_max_local_range()[0]; + const std::size_t base = + elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) + + sg.get_group_id()[0] * sgSize); + + if (base + 
elems_per_wi * sgSize < nelems_) { + using dpctl::tensor::sycl_utils::sub_group_load; + using dpctl::tensor::sycl_utils::sub_group_store; +#pragma unroll + for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) { + const std::size_t offset = base + it * sgSize; + auto a_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&a_[offset]); + auto b_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&b_[offset]); + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&out_[offset]); + + const sycl::vec a_vec = + sub_group_load(sg, a_multi_ptr); + const sycl::vec b_vec = + sub_group_load(sg, b_multi_ptr); + + sycl::vec res_vec; +#pragma unroll + for (std::uint8_t vec_id = 0; vec_id < vec_sz; ++vec_id) { + res_vec[vec_id] = isclose(a_vec[vec_id], b_vec[vec_id], + rtol_, atol_, equal_nan_); + } + sub_group_store(sg, res_vec, out_multi_ptr); + } + } + else { + const std::size_t lane_id = sg.get_local_id()[0]; + for (std::size_t k = base + lane_id; k < nelems_; k += sgSize) { + out_[k] = isclose(a_[k], b_[k], rtol_, atol_, equal_nan_); + } + } + } + else { + const std::uint16_t sgSize = + ndit.get_sub_group().get_local_range()[0]; + const std::size_t gid = ndit.get_global_linear_id(); + const std::uint16_t elems_per_sg = sgSize * elems_per_wi; + + const std::size_t start = + (gid / sgSize) * (elems_per_sg - sgSize) + gid; + const std::size_t end = std::min(nelems_, start + elems_per_sg); + for (std::size_t offset = start; offset < end; offset += sgSize) { + out_[offset] = + isclose(a_[offset], b_[offset], rtol_, atol_, equal_nan_); + } + } + } +}; + +template +sycl::event + isclose_strided_scalar_impl(sycl::queue &exec_q, + const int nd, + std::size_t nelems, + const dpctl::tensor::ssize_t *shape_strides, + const scT rtol, + const scT atol, + const bool equal_nan, + const char *a_cp, + const dpctl::tensor::ssize_t a_offset, + const char *b_cp, + const dpctl::tensor::ssize_t b_offset, + char *out_cp, + const dpctl::tensor::ssize_t out_offset, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(exec_q); + + const T *a_tp = reinterpret_cast(a_cp); + const T *b_tp = reinterpret_cast(b_cp); + + using resTy = bool; + resTy *out_tp = reinterpret_cast(out_cp); + + using IndexerT = + typename dpctl::tensor::offset_utils::ThreeOffsets_StridedIndexer; + const IndexerT indexer{nd, a_offset, b_offset, out_offset, shape_strides}; + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using IsCloseFunc = + IsCloseStridedScalarFunctor; + cgh.parallel_for( + {nelems}, + IsCloseFunc(a_tp, b_tp, out_tp, indexer, rtol, atol, equal_nan)); + }); + return comp_ev; +} + +template +sycl::event + isclose_contig_scalar_impl(sycl::queue &exec_q, + std::size_t nelems, + const scT rtol, + const scT atol, + const bool equal_nan, + const char *a_cp, + const char *b_cp, + char *out_cp, + const std::vector &depends = {}) +{ + constexpr std::uint8_t elems_per_wi = n_vecs * vec_sz; + const std::size_t n_work_items_needed = nelems / elems_per_wi; + const std::size_t empirical_threshold = std::size_t(1) << 21; + const std::size_t lws = (n_work_items_needed <= empirical_threshold) + ? 
std::size_t(128) + : std::size_t(256); + + const std::size_t n_groups = + ((nelems + lws * elems_per_wi - 1) / (lws * elems_per_wi)); + const auto gws_range = sycl::range<1>(n_groups * lws); + const auto lws_range = sycl::range<1>(lws); + + const T *a_tp = reinterpret_cast(a_cp); + const T *b_tp = reinterpret_cast(b_cp); + + using resTy = bool; + resTy *out_tp = reinterpret_cast(out_cp); + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using dpctl::tensor::kernels::alignment_utils::is_aligned; + using dpctl::tensor::kernels::alignment_utils::required_alignment; + if (is_aligned(a_tp) && + is_aligned(b_tp) && + is_aligned(out_tp)) + { + constexpr bool enable_sg_loadstore = true; + using IsCloseFunc = + IsCloseContigScalarFunctor; + + cgh.parallel_for( + sycl::nd_range<1>(gws_range, lws_range), + IsCloseFunc(a_tp, b_tp, out_tp, nelems, rtol, atol, equal_nan)); + } + else { + constexpr bool disable_sg_loadstore = false; + using IsCloseFunc = + IsCloseContigScalarFunctor; + + cgh.parallel_for( + sycl::nd_range<1>(gws_range, lws_range), + IsCloseFunc(a_tp, b_tp, out_tp, nelems, rtol, atol, equal_nan)); + } + }); + + return comp_ev; +} + +} // namespace dpnp::kernels::isclose diff --git a/dpnp/dpnp_iface_logic.py b/dpnp/dpnp_iface_logic.py index 84258f698b8f..bc4a79af20d5 100644 --- a/dpnp/dpnp_iface_logic.py +++ b/dpnp/dpnp_iface_logic.py @@ -44,9 +44,11 @@ import dpctl.tensor as dpt import dpctl.tensor._tensor_elementwise_impl as ti +import dpctl.utils as dpu import numpy import dpnp +import dpnp.backend.extensions.ufunc._ufunc_impl as ufi from dpnp.dpnp_algo.dpnp_elementwise_common import DPNPBinaryFunc, DPNPUnaryFunc from .dpnp_utils import get_usm_allocations @@ -82,6 +84,74 @@ ] +def _isclose_scalar_tol(a, b, rtol, atol, equal_nan): + """ + Specialized implementation of dpnp.isclose() for scalar rtol and atol + using a dedicated SYCL kernel. + """ + dt = dpnp.result_type(a, b, 1.0) + + if dpnp.isscalar(a): + usm_type = b.usm_type + exec_q = b.sycl_queue + a = dpnp.array( + a, + dt, + usm_type=usm_type, + sycl_queue=exec_q, + ) + elif dpnp.isscalar(b): + usm_type = a.usm_type + exec_q = a.sycl_queue + b = dpnp.array( + b, + dt, + usm_type=usm_type, + sycl_queue=exec_q, + ) + else: + usm_type, exec_q = get_usm_allocations([a, b]) + + a = dpnp.astype(a, dt, casting="same_kind", copy=False) + b = dpnp.astype(b, dt, casting="same_kind", copy=False) + + # Convert complex rtol/atol to to their real parts + # to avoid pybind11 cast errors and match NumPy behavior + if isinstance(rtol, complex): + rtol = rtol.real + if isinstance(atol, complex): + atol = atol.real + + # pylint: disable=W0707 + try: + a, b = dpnp.broadcast_arrays(a, b) + except ValueError: + raise ValueError( + "operands could not be broadcast together with shapes " + f"{a.shape} and {b.shape}" + ) + + out_dtype = dpnp.bool + output = dpnp.empty( + a.shape, dtype=out_dtype, sycl_queue=exec_q, usm_type=usm_type + ) + + _manager = dpu.SequentialOrderManager[exec_q] + mem_ev, ht_ev = ufi._isclose_scalar( + a.get_array(), + b.get_array(), + rtol, + atol, + equal_nan, + output.get_array(), + exec_q, + depends=_manager.submitted_events, + ) + _manager.add_event_pair(mem_ev, ht_ev) + + return output + + def all(a, /, axis=None, out=None, keepdims=False, *, where=True): """ Test whether all array elements along a given axis evaluate to ``True``. 
@@ -870,6 +940,10 @@ def isclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False): rtol, atol, scalar_type=True, all_scalars=True ) + # Use own SYCL kernel for scalar rtol/atol + if dpnp.isscalar(rtol) and dpnp.isscalar(atol): + return _isclose_scalar_tol(a, b, rtol, atol, equal_nan) + # make sure b is an inexact type to avoid bad behavior on abs(MIN_INT) if dpnp.isscalar(b): dt = dpnp.result_type(a, b, 1.0, rtol, atol) diff --git a/dpnp/tests/test_logic.py b/dpnp/tests/test_logic.py index 4281279da450..bf003410849b 100644 --- a/dpnp/tests/test_logic.py +++ b/dpnp/tests/test_logic.py @@ -10,11 +10,14 @@ import dpnp from .helper import ( + generate_random_numpy_array, get_all_dtypes, + get_complex_dtypes, get_float_complex_dtypes, get_float_dtypes, get_integer_float_dtypes, ) +from .third_party.cupy import testing class TestAllAny: @@ -515,23 +518,177 @@ def test_infinity_sign_errors(func): getattr(dpnp, func)(x, out=out) -@pytest.mark.parametrize("dtype", get_integer_float_dtypes()) -@pytest.mark.parametrize( - "rtol", [1e-05, dpnp.array(1e-05), dpnp.full(10, 1e-05)] -) -@pytest.mark.parametrize( - "atol", [1e-08, dpnp.array(1e-08), dpnp.full(10, 1e-08)] -) -def test_isclose(dtype, rtol, atol): - a = numpy.random.rand(10) - b = a + numpy.random.rand(10) * 1e-8 +class TestIsClose: + @pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True)) + @pytest.mark.parametrize( + "rtol", [1e-5, dpnp.array(1e-5), dpnp.full((10,), 1e-5)] + ) + @pytest.mark.parametrize( + "atol", [1e-8, dpnp.array(1e-8), dpnp.full((10,), 1e-8)] + ) + def test_isclose(self, dtype, rtol, atol): + a = numpy.random.rand(10) + b = a + numpy.random.rand(10) * 1e-8 - dpnp_a = dpnp.array(a, dtype=dtype) - dpnp_b = dpnp.array(b, dtype=dtype) + dpnp_a = dpnp.array(a, dtype=dtype) + dpnp_b = dpnp.array(b, dtype=dtype) - np_res = numpy.isclose(a, b, 1e-05, 1e-08) - dpnp_res = dpnp.isclose(dpnp_a, dpnp_b, rtol, atol) - assert_allclose(dpnp_res, np_res) + np_res = numpy.isclose(a, b, rtol=1e-5, atol=1e-8) + dpnp_res = dpnp.isclose(dpnp_a, dpnp_b, rtol=rtol, atol=atol) + assert_allclose(dpnp_res, np_res) + + @pytest.mark.parametrize("dtype", get_complex_dtypes()) + @pytest.mark.parametrize("shape", [(4, 4), (16, 16), (4, 4, 4)]) + def test_isclose_complex(self, dtype, shape): + a = generate_random_numpy_array(shape, dtype=dtype, seed_value=81) + b = a.copy() + + b = b + (1e-6 + 1e-6j) + + dpnp_a = dpnp.array(a, dtype=dtype) + dpnp_b = dpnp.array(b, dtype=dtype) + + np_res = numpy.isclose(a, b) + dpnp_res = dpnp.isclose(dpnp_a, dpnp_b) + assert_allclose(dpnp_res, np_res) + + @pytest.mark.parametrize( + "rtol, atol", + [ + (1e-5, 1e-8), + (dpnp.array(1e-5), dpnp.array(1e-8)), + ], + ) + def test_empty_input(self, rtol, atol): + a = numpy.array([]) + b = numpy.array([]) + + dpnp_a = dpnp.array(a) + dpnp_b = dpnp.array(b) + + np_res = numpy.isclose(a, b, rtol=1e-5, atol=1e-8) + dpnp_res = dpnp.isclose(dpnp_a, dpnp_b, rtol=rtol, atol=atol) + assert_allclose(dpnp_res, np_res) + + @pytest.mark.parametrize( + "rtol, atol", + [ + (1e-5, 1e-8), + (dpnp.array(1e-5), dpnp.array(1e-8)), + ], + ) + @pytest.mark.parametrize("val", [1.0, numpy.inf, -numpy.inf, numpy.nan]) + def test_input_0d(self, val, rtol, atol): + dp_arr = dpnp.array(val) + np_arr = numpy.array(val) + + # array & scalar + dp_res = dpnp.isclose(dp_arr, val, rtol=rtol, atol=atol) + np_res = numpy.isclose(np_arr, val, rtol=1e-5, atol=1e-8) + assert_allclose(dp_res, np_res) + + # scalar & array + dp_res = dpnp.isclose(val, dp_arr, rtol=rtol, atol=atol) + np_res = numpy.isclose(val, 
np_arr, rtol=1e-5, atol=1e-8) + assert_allclose(dp_res, np_res) + + # array & array + dp_res = dpnp.isclose(dp_arr, dp_arr, rtol=rtol, atol=atol) + np_res = numpy.isclose(np_arr, np_arr, rtol=1e-5, atol=1e-8) + assert_allclose(dp_res, np_res) + + @pytest.mark.parametrize( + "sh_a, sh_b", + [ + ((10,), (1,)), + ((3, 1, 5), (3, 5)), + ((3, 1, 5), (1, 3, 5)), + ((1, 10), (10,)), + ], + ) + def test_broadcast_shapes(self, sh_a, sh_b): + a_np = numpy.ones(sh_a) + b_np = numpy.ones(sh_b) + + a_dp = dpnp.ones(sh_a) + b_dp = dpnp.ones(sh_b) + + np_res = numpy.isclose(a_np, b_np) + dp_res = dpnp.isclose(a_dp, b_dp) + assert_allclose(dp_res, np_res) + + @pytest.mark.parametrize( + "rtol, atol", + [ + (1e-5, 1e-8), + (dpnp.array(1e-5), dpnp.array(1e-8)), + ], + ) + def test_equal_nan(self, rtol, atol): + a = numpy.array([numpy.nan, 1.0]) + b = numpy.array([numpy.nan, 1.0]) + + dp_a = dpnp.array(a) + dp_b = dpnp.array(b) + + np_res = numpy.isclose(a, b, rtol=1e-5, atol=1e-8, equal_nan=True) + dp_res = dpnp.isclose(dp_a, dp_b, rtol=rtol, atol=atol, equal_nan=True) + assert_allclose(dp_res, np_res) + + # array-like rtol/atol support requires NumPy >= 2.0 + @testing.with_requires("numpy>=2.0") + def test_rtol_atol_arrays(self): + a = numpy.array([2.1, 2.1, 2.1, 2.1, 5, numpy.nan]) + b = numpy.array([2, 2, 2, 2, numpy.nan, 5]) + atol = numpy.array([0.11, 0.09, 1e-8, 1e-8, 1, 1]) + rtol = numpy.array([1e-8, 1e-8, 0.06, 0.04, 1, 1]) + + dp_a = dpnp.array(a) + dp_b = dpnp.array(b) + dp_rtol = dpnp.array(rtol) + dp_atol = dpnp.array(atol) + + np_res = numpy.isclose(a, b, rtol=rtol, atol=atol) + dp_res = dpnp.isclose(dp_a, dp_b, rtol=dp_rtol, atol=dp_atol) + assert_allclose(dp_res, np_res) + + @pytest.mark.parametrize( + "rtol, atol", + [ + (0 + 1e-5j, 1e-08), + (1e-05, 0 + 1e-8j), + (0 + 1e-5j, 0 + 1e-8j), + ], + ) + def test_rtol_atol_complex(self, rtol, atol): + a = dpnp.array([1.0, 1.0]) + b = dpnp.array([1.0, 1.0 + 1e-6]) + + dpnp_res = dpnp.isclose(a, b, rtol=rtol, atol=atol) + np_res = numpy.isclose(a.asnumpy(), b.asnumpy(), rtol=rtol, atol=atol) + assert_allclose(dpnp_res, np_res) + + # NEP 50: float32 vs Python float comparison requires NumPy >= 2.0 + @testing.with_requires("numpy>=2.0") + def test_rtol_atol_nep50(self): + below_one = float(1.0 - numpy.finfo("f8").eps) + f32 = numpy.array(below_one, dtype="f4") + dp_f32 = dpnp.array(f32) + + assert_allclose( + dpnp.isclose(dp_f32, below_one, rtol=0, atol=0), + numpy.isclose(f32, below_one, rtol=0, atol=0), + ) + + def test_invalid_input(self): + # unsupported type + assert_raises(TypeError, dpnp.isclose, 1.0, 1.0) + assert_raises(TypeError, dpnp.isclose, [1.0], [1.0]) + + # broadcast error + assert_raises( + ValueError, dpnp.isclose, dpnp.ones((10,)), dpnp.ones((3, 5)) + ) @pytest.mark.parametrize("a", [numpy.array([1, 2]), numpy.array([1, 1])])
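---

For context, a brief usage sketch of the behaviour this patch accelerates. With this change, `dpnp.isclose` dispatches scalar `rtol`/`atol` through the new `_isclose_scalar_tol` helper to the dedicated `_isclose_scalar` kernel, while array-valued tolerances keep the pre-existing elementwise path. The snippet below is illustrative only (it assumes a working dpnp installation with a default SYCL device; the printed values in comments follow from the closeness criterion used by the kernel, `|a - b| <= atol + rtol * |b|` for finite inputs).

```python
import numpy
import dpnp

a = dpnp.array([1.0, 1.0, numpy.nan])
b = dpnp.array([1.0 + 1e-9, 1.1, numpy.nan])

# Scalar tolerances -> new dedicated kernel path (_isclose_scalar)
print(dpnp.isclose(a, b, rtol=1e-5, atol=1e-8))
# expected: [ True False False]
print(dpnp.isclose(a, b, rtol=1e-5, atol=1e-8, equal_nan=True))
# expected: [ True False  True]

# Array-valued tolerances -> existing general implementation
rtol = dpnp.full(3, 1e-5)
print(dpnp.isclose(a, b, rtol=rtol, atol=1e-8))
# expected: [ True False False]

# Results match NumPy's isclose for the same inputs
print(numpy.isclose(a.asnumpy(), b.asnumpy(), rtol=1e-5, atol=1e-8))
```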