From a669603268640f30c1cadb24dc68396838e26bfa Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Thu, 13 Nov 2025 15:20:07 -0800 Subject: [PATCH 1/9] Use vectorcall for all-positional-argument calls If a handle or object is called with only positional arguments, it is straightforward to use PyObject_Vectorcall instead of PyObject_CallObject. Benchmarked by adding a trivial function to pybind11_benchmark: ``` m.def("call_func_with_int", [](py::object func) { return func(py::cast(1)); }); ``` and then running `python -m timeit --setup 'from pybind11_benchmark import call_func_with_int; f = lambda x: x + 1' 'call_func_with_int(f)'`. Before on M4 mac: 57.6 nsec per loop After on M4 mac: 48.4 nsec per loop For comparison, the included collatz benchmark takes 33.1 nsec per loop, just calling `f(1)` directly takes 17.8 nsec per loop, and simply running `pass` takes 4.19 nsec per loop. --- include/pybind11/cast.h | 49 ++++++++++++++++++++++++++++++-------- include/pybind11/pytypes.h | 2 +- 2 files changed, 40 insertions(+), 11 deletions(-) diff --git a/include/pybind11/cast.h b/include/pybind11/cast.h index 556bdb7e34..8eba512b43 100644 --- a/include/pybind11/cast.h +++ b/include/pybind11/cast.h @@ -2171,21 +2171,50 @@ class argument_loader { /// Helper class which collects only positional arguments for a Python function call. /// A fancier version below can collect any argument, but this one is optimal for simple calls. 
-template +template class simple_collector { public: template - explicit simple_collector(Ts &&...values) - : m_args(pybind11::make_tuple(std::forward(values)...)) {} + explicit simple_collector(Ts &&...values) { + static_assert(sizeof...(Ts) == N); + size_t i = 0; + using expander = int[]; + (void) expander{ + 0, + (m_args[i++] = detail::make_caster::cast(std::forward(values), policy, nullptr) + .inc_ref() + .ptr(), + 0)...}; + for (i = 0; i < N; ++i) { + if (!m_args[i]) { +#if !defined(PYBIND11_DETAILED_ERROR_MESSAGES) + throw cast_error_unable_to_convert_call_arg(std::to_string(i)); +#else + std::array argtypes{{type_id()...}}; + throw cast_error_unable_to_convert_call_arg(std::to_string(i), argtypes[i]); +#endif + } + } + } - const tuple &args() const & { return m_args; } - dict kwargs() const { return {}; } + ~simple_collector() { + for (size_t i = 0; i < N; ++i) { + handle(m_args[i]).dec_ref(); + } + } - tuple args() && { return std::move(m_args); } + tuple args() const { + tuple result(N); + for (size_t i = 0; i < N; ++i) { + PyTuple_SET_ITEM(result.ptr(), i, handle(m_args[i]).inc_ref().ptr()); + } + return result; + } + dict kwargs() const { return {}; } /// Call a Python function and pass the collected arguments object call(PyObject *ptr) const { - PyObject *result = PyObject_CallObject(ptr, m_args.ptr()); + PyObject *result = PyObject_Vectorcall(ptr, m_args.data(), N, nullptr); if (!result) { throw error_already_set(); } @@ -2193,7 +2222,7 @@ class simple_collector { } private: - tuple m_args; + std::array m_args; }; /// Helper class which collects positional, keyword, * and ** arguments for a Python function call @@ -2328,8 +2357,8 @@ constexpr bool args_are_all_positional() { template ()>> -simple_collector collect_arguments(Args &&...args) { - return simple_collector(std::forward(args)...); +simple_collector collect_arguments(Args &&...args) { + return simple_collector(std::forward(args)...); } /// Collect all arguments, including keywords and 
unpacking (only instantiated when needed) diff --git a/include/pybind11/pytypes.h b/include/pybind11/pytypes.h index cee4ab5623..7db4f98b78 100644 --- a/include/pybind11/pytypes.h +++ b/include/pybind11/pytypes.h @@ -1394,7 +1394,7 @@ template using is_keyword_or_ds = satisfies_any_of; // Call argument collector forward declarations -template +template class simple_collector; template class unpacking_collector; From ddcc5d2d4b824507766dedf55d682a1c894c510a Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Fri, 14 Nov 2025 09:29:08 -0800 Subject: [PATCH 2/9] Make simple_collector non-copyable and non-movable --- include/pybind11/cast.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/pybind11/cast.h b/include/pybind11/cast.h index 8eba512b43..be235bd8d3 100644 --- a/include/pybind11/cast.h +++ b/include/pybind11/cast.h @@ -2203,6 +2203,11 @@ class simple_collector { } } + simple_collector(const simple_collector &) = delete; + simple_collector(simple_collector &&) noexcept = delete; + simple_collector &operator=(const simple_collector &) = delete; + simple_collector &operator=(simple_collector &&) noexcept = delete; + tuple args() const { tuple result(N); for (size_t i = 0; i < N; ++i) { From 9120b9c38866e47263153e3c52f8359d2bab1774 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Fri, 14 Nov 2025 09:42:09 -0800 Subject: [PATCH 3/9] Restore PyObject_CallObject compatibility path for old Python versions --- include/pybind11/cast.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/pybind11/cast.h b/include/pybind11/cast.h index be235bd8d3..59861dcd8a 100644 --- a/include/pybind11/cast.h +++ b/include/pybind11/cast.h @@ -2219,7 +2219,11 @@ class simple_collector { /// Call a Python function and pass the collected arguments object call(PyObject *ptr) const { +#if PY_VERSION_HEX >= 0x03090000 PyObject *result = PyObject_Vectorcall(ptr, m_args.data(), N, nullptr); +#else + PyObject *result = PyObject_CallObject(ptr, args().ptr()); +#endif if 
(!result) { throw error_already_set(); } From d5a88c3d1e5f721e7ac1ccdd53f30223794ecc9a Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Fri, 14 Nov 2025 11:18:17 -0800 Subject: [PATCH 4/9] Fix the fix for Python 3.8. Allow moving of simple_collector for C++11 and 14. --- include/pybind11/cast.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/include/pybind11/cast.h b/include/pybind11/cast.h index 59861dcd8a..a30960e2bf 100644 --- a/include/pybind11/cast.h +++ b/include/pybind11/cast.h @@ -2176,7 +2176,7 @@ class simple_collector { public: template explicit simple_collector(Ts &&...values) { - static_assert(sizeof...(Ts) == N); + static_assert(sizeof...(Ts) == N, ""); size_t i = 0; using expander = int[]; (void) expander{ @@ -2204,9 +2204,9 @@ class simple_collector { } simple_collector(const simple_collector &) = delete; - simple_collector(simple_collector &&) noexcept = delete; + simple_collector(simple_collector &&) noexcept = default; simple_collector &operator=(const simple_collector &) = delete; - simple_collector &operator=(simple_collector &&) noexcept = delete; + simple_collector &operator=(simple_collector &&) noexcept = default; tuple args() const { tuple result(N); @@ -2222,7 +2222,8 @@ class simple_collector { #if PY_VERSION_HEX >= 0x03090000 PyObject *result = PyObject_Vectorcall(ptr, m_args.data(), N, nullptr); #else - PyObject *result = PyObject_CallObject(ptr, args().ptr()); + // Use the old name for 3.8. 
+ PyObject *result = _PyObject_Vectorcall(ptr, m_args.data(), N, nullptr); #endif if (!result) { throw error_already_set(); From 7e667622526a4b6656584134b0903a1329a0be31 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Fri, 14 Nov 2025 12:23:59 -0800 Subject: [PATCH 5/9] suppress -Wtype-limits --- include/pybind11/cast.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/pybind11/cast.h b/include/pybind11/cast.h index a30960e2bf..797846b828 100644 --- a/include/pybind11/cast.h +++ b/include/pybind11/cast.h @@ -2173,6 +2173,9 @@ class argument_loader { /// A fancier version below can collect any argument, but this one is optimal for simple calls. template class simple_collector { + // Disable warnings about useless comparisons when N == 0. + PYBIND11_WARNING_PUSH + PYBIND11_WARNING_DISABLE_GCC("-Wtype-limits") public: template explicit simple_collector(Ts &&...values) { @@ -2233,6 +2236,7 @@ class simple_collector { private: std::array m_args; + PYBIND11_WARNING_POP }; /// Helper class which collects positional, keyword, * and ** arguments for a Python function call From f6aaf686cb2da87f9f98f1e2062d6ea4e523f016 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Fri, 14 Nov 2025 12:50:54 -0800 Subject: [PATCH 6/9] suppress intel version of -Wtype-limits --- include/pybind11/cast.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/pybind11/cast.h b/include/pybind11/cast.h index 797846b828..3d6e21f008 100644 --- a/include/pybind11/cast.h +++ b/include/pybind11/cast.h @@ -2176,6 +2176,7 @@ class simple_collector { // Disable warnings about useless comparisons when N == 0. 
PYBIND11_WARNING_PUSH PYBIND11_WARNING_DISABLE_GCC("-Wtype-limits") + PYBIND11_WARNING_DISABLE_INTEL(186) public: template explicit simple_collector(Ts &&...values) { From 7529fbedd371fbfd93c5f096579afd141ef48a24 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Fri, 14 Nov 2025 14:12:10 -0800 Subject: [PATCH 7/9] Try putting the suppression at class level --- include/pybind11/cast.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/pybind11/cast.h b/include/pybind11/cast.h index 3d6e21f008..a00b58d4b2 100644 --- a/include/pybind11/cast.h +++ b/include/pybind11/cast.h @@ -2171,12 +2171,12 @@ class argument_loader { /// Helper class which collects only positional arguments for a Python function call. /// A fancier version below can collect any argument, but this one is optimal for simple calls. +// Disable warnings about useless comparisons when N == 0. +PYBIND11_WARNING_PUSH +PYBIND11_WARNING_DISABLE_GCC("-Wtype-limits") +PYBIND11_WARNING_DISABLE_INTEL(186) template class simple_collector { - // Disable warnings about useless comparisons when N == 0. - PYBIND11_WARNING_PUSH - PYBIND11_WARNING_DISABLE_GCC("-Wtype-limits") - PYBIND11_WARNING_DISABLE_INTEL(186) public: template explicit simple_collector(Ts &&...values) { From 6c1b2f7908b50949cef7ff546096ad48d98d5710 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Fri, 14 Nov 2025 14:18:09 -0800 Subject: [PATCH 8/9] try suppressing for each loop --- include/pybind11/cast.h | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/include/pybind11/cast.h b/include/pybind11/cast.h index a00b58d4b2..059c9d0ccf 100644 --- a/include/pybind11/cast.h +++ b/include/pybind11/cast.h @@ -2171,10 +2171,6 @@ class argument_loader { /// Helper class which collects only positional arguments for a Python function call. /// A fancier version below can collect any argument, but this one is optimal for simple calls. -// Disable warnings about useless comparisons when N == 0. 
-PYBIND11_WARNING_PUSH -PYBIND11_WARNING_DISABLE_GCC("-Wtype-limits") -PYBIND11_WARNING_DISABLE_INTEL(186) template class simple_collector { public: @@ -2189,6 +2185,9 @@ class simple_collector { .inc_ref() .ptr(), 0)...}; + PYBIND11_WARNING_PUSH + PYBIND11_WARNING_DISABLE_GCC("-Wtype-limits") + PYBIND11_WARNING_DISABLE_INTEL(186) for (i = 0; i < N; ++i) { if (!m_args[i]) { #if !defined(PYBIND11_DETAILED_ERROR_MESSAGES) @@ -2199,12 +2198,17 @@ class simple_collector { #endif } } + PYBIND11_WARNING_POP } ~simple_collector() { + PYBIND11_WARNING_PUSH + PYBIND11_WARNING_DISABLE_GCC("-Wtype-limits") + PYBIND11_WARNING_DISABLE_INTEL(186) for (size_t i = 0; i < N; ++i) { handle(m_args[i]).dec_ref(); } + PYBIND11_WARNING_POP } simple_collector(const simple_collector &) = delete; @@ -2214,9 +2218,13 @@ class simple_collector { tuple args() const { tuple result(N); + PYBIND11_WARNING_PUSH + PYBIND11_WARNING_DISABLE_GCC("-Wtype-limits") + PYBIND11_WARNING_DISABLE_INTEL(186) for (size_t i = 0; i < N; ++i) { PyTuple_SET_ITEM(result.ptr(), i, handle(m_args[i]).inc_ref().ptr()); } + PYBIND11_WARNING_POP return result; } dict kwargs() const { return {}; } @@ -2237,7 +2245,6 @@ class simple_collector { private: std::array m_args; - PYBIND11_WARNING_POP }; /// Helper class which collects positional, keyword, * and ** arguments for a Python function call From 278f081b09e60ef8696bf8fe9970539f9da45937 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Fri, 14 Nov 2025 15:02:32 -0800 Subject: [PATCH 9/9] Suppress for NVCC as well --- include/pybind11/cast.h | 3 +++ include/pybind11/detail/pybind11_namespace_macros.h | 6 ++++++ 2 files changed, 9 insertions(+) diff --git a/include/pybind11/cast.h b/include/pybind11/cast.h index 059c9d0ccf..e8742e2136 100644 --- a/include/pybind11/cast.h +++ b/include/pybind11/cast.h @@ -2188,6 +2188,7 @@ class simple_collector { PYBIND11_WARNING_PUSH PYBIND11_WARNING_DISABLE_GCC("-Wtype-limits") PYBIND11_WARNING_DISABLE_INTEL(186) + 
PYBIND11_WARNING_DISABLE_NVCC(186) for (i = 0; i < N; ++i) { if (!m_args[i]) { #if !defined(PYBIND11_DETAILED_ERROR_MESSAGES) @@ -2205,6 +2206,7 @@ class simple_collector { PYBIND11_WARNING_PUSH PYBIND11_WARNING_DISABLE_GCC("-Wtype-limits") PYBIND11_WARNING_DISABLE_INTEL(186) + PYBIND11_WARNING_DISABLE_NVCC(186) for (size_t i = 0; i < N; ++i) { handle(m_args[i]).dec_ref(); } @@ -2221,6 +2223,7 @@ class simple_collector { PYBIND11_WARNING_PUSH PYBIND11_WARNING_DISABLE_GCC("-Wtype-limits") PYBIND11_WARNING_DISABLE_INTEL(186) + PYBIND11_WARNING_DISABLE_NVCC(186) for (size_t i = 0; i < N; ++i) { PyTuple_SET_ITEM(result.ptr(), i, handle(m_args[i]).inc_ref().ptr()); } diff --git a/include/pybind11/detail/pybind11_namespace_macros.h b/include/pybind11/detail/pybind11_namespace_macros.h index 6f74bf85c7..40e18d4e67 100644 --- a/include/pybind11/detail/pybind11_namespace_macros.h +++ b/include/pybind11/detail/pybind11_namespace_macros.h @@ -62,6 +62,12 @@ # define PYBIND11_WARNING_DISABLE_INTEL(name) #endif +#ifdef __CUDACC__ +# define PYBIND11_WARNING_DISABLE_NVCC(name) PYBIND11_PRAGMA(nv_diag_suppress name) +#else +# define PYBIND11_WARNING_DISABLE_NVCC(name) +#endif + #define PYBIND11_NAMESPACE_BEGIN(name) \ namespace name { \ PYBIND11_WARNING_PUSH