From cc0b0533eb60401432bab83314a71d44b9ace954 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 16 Sep 2025 16:49:17 -0700 Subject: [PATCH 1/7] Add fast_type_map, use it authoritatively for local types and as a hint for global types nanobind has a similar two-level lookup strategy, added and explained by https://github.com/wjakob/nanobind/commit/b515b1f7f2f4ecc0357818e6201c94a9f4cbfdc2 In this PR I've ported this approach to pybind11. To avoid an ABI break, I've kept the fast maps to the `local_internals`. I think this should be safe because any particular module should see its `local_internals` reset at least as often as the global `internals`, and misses in the fast "hint" map for global types fall back to the global `internals`. Performance seems to have improved. Using my patched fork of pybind11_benchmark (https://github.com/swolchok/pybind11_benchmark/tree/benchmark-updates, specifically commit hash b6613d12607104d547b1c10a8145d1b3e9937266), I run bench.py and observe the MyInt case. Each time, I do 3 runs and just report all 3. master, Mac: 75.9, 76.9, 75.3 nsec/loop this PR, Mac: 73.8, 73.8, 73.6 nsec/loop master, Linux box: 188, 187, 188 nsec/loop this PR, Linux box: 164, 165, 164 nsec/loop Note that the "real" percentage improvement is larger than implied by the above because master does not yet include #5824. --- include/pybind11/detail/class.h | 4 ++- include/pybind11/detail/internals.h | 20 ++++++++++-- include/pybind11/detail/type_caster_base.h | 38 +++++++++++++++++----- include/pybind11/pybind11.h | 18 +++++++--- tests/test_embed/external_module.cpp | 23 ++++++++++++- 5 files changed, 86 insertions(+), 17 deletions(-) diff --git a/include/pybind11/detail/class.h b/include/pybind11/detail/class.h index cd7e87f845..848141c014 100644 --- a/include/pybind11/detail/class.h +++ b/include/pybind11/detail/class.h @@ -221,10 +221,12 @@ extern "C" inline void pybind11_meta_dealloc(PyObject *obj) { auto tindex = std::type_index(*tinfo->cpptype); internals.direct_conversions.erase(tindex); + auto &local_internals = get_local_internals(); if (tinfo->module_local) { - get_local_internals().registered_types_cpp.erase(tindex); + local_internals.registered_types_cpp.erase(tinfo->cpptype); } else { internals.registered_types_cpp.erase(tindex); + local_internals.global_registered_types_cpp_fast.erase(tinfo->cpptype); } internals.registered_types_py.erase(tinfo->type); diff --git a/include/pybind11/detail/internals.h b/include/pybind11/detail/internals.h index d23ee6ec91..1d8f8fd433 100644 --- a/include/pybind11/detail/internals.h +++ b/include/pybind11/detail/internals.h @@ -39,7 +39,9 @@ /// further ABI-incompatible changes may be made before the ABI is officially /// changed to the new version. #ifndef PYBIND11_INTERNALS_VERSION -// REMINDER for next version bump: remove loader_life_support_tls +// REMINDER for next version bump: remove loader_life_support_tls, +// move local_internals::global_registered_types_cpp_fast to +// internals::registered_types_cpp_fast # define PYBIND11_INTERNALS_VERSION 11 #endif @@ -177,6 +179,12 @@ struct type_equal_to { }; #endif +template +// REVIEW: do we need to add a fancy hash for pointers or is the +// possible identity hash function from the standard library (e.g., +// libstdc++) sufficient? +using fast_type_map = std::unordered_map; + template using type_map = std::unordered_map; @@ -302,7 +310,15 @@ struct internals { // impact any other modules, because the only things accessing the local internals is the // module that contains them. struct local_internals { - type_map registered_types_cpp; + // It should be safe to use fast_type_map here because this entire + // data structure is scoped to our single module, and thus a single + // DSO and single instance of type_info for any particular type. + fast_type_map registered_types_cpp; + + // fast hint for the *global* internals registered_types_cpp + // map. If we lookup successfully, that's the right answer; + // otherwise we go to the global map and then backfill this one. + fast_type_map global_registered_types_cpp_fast; std::forward_list registered_exception_translators; PyTypeObject *function_record_py_type = nullptr; }; diff --git a/include/pybind11/detail/type_caster_base.h b/include/pybind11/detail/type_caster_base.h index 23d9407350..0e8565867a 100644 --- a/include/pybind11/detail/type_caster_base.h +++ b/include/pybind11/detail/type_caster_base.h @@ -205,35 +205,57 @@ PYBIND11_NOINLINE detail::type_info *get_type_info(PyTypeObject *type) { return bases.front(); } -inline detail::type_info *get_local_type_info(const std::type_index &tp) { - auto &locals = get_local_internals().registered_types_cpp; - auto it = locals.find(tp); +inline detail::type_info *get_local_type_info(const std::type_info &tp, + const local_internals &local_internals) { + const auto &locals = local_internals.registered_types_cpp; + auto it = locals.find(&tp); if (it != locals.end()) { return it->second; } return nullptr; } -inline detail::type_info *get_global_type_info(const std::type_index &tp) { +inline detail::type_info *get_local_type_info(const std::type_info &tp) { + return get_local_type_info(tp, get_local_internals()); +} + +inline detail::type_info *get_global_type_info(const std::type_info &tp, + local_internals &local_internals) { return with_internals([&](internals &internals) { detail::type_info *type_info = nullptr; + auto &fast_types = local_internals.global_registered_types_cpp_fast; auto &types = internals.registered_types_cpp; - auto it = types.find(tp); + auto fast_it = fast_types.find(&tp); + if (fast_it != fast_types.end()) { +#ifndef NDEBUG + auto types_it = types.find(std::type_index(tp)); + assert(types_it != types.end()); + assert(types_it->second == fast_it->second); +#endif + return fast_it->second; + } + auto it = types.find(std::type_index(tp)); if (it != types.end()) { + fast_types.emplace(&tp, it->second); type_info = it->second; } return type_info; }); } +inline detail::type_info *get_global_type_info(const std::type_info &tp) { + return get_global_type_info(tp, get_local_internals()); +} + /// Return the type info for a given C++ type; on lookup failure can either throw or return /// nullptr. -PYBIND11_NOINLINE detail::type_info *get_type_info(const std::type_index &tp, +PYBIND11_NOINLINE detail::type_info *get_type_info(const std::type_info &tp, bool throw_if_missing = false) { - if (auto *ltype = get_local_type_info(tp)) { + auto &local_internals = get_local_internals(); + if (auto *ltype = get_local_type_info(tp, local_internals)) { return ltype; } - if (auto *gtype = get_global_type_info(tp)) { + if (auto *gtype = get_global_type_info(tp, local_internals)) { return gtype; } diff --git a/include/pybind11/pybind11.h b/include/pybind11/pybind11.h index 117fecabf0..058907969d 100644 --- a/include/pybind11/pybind11.h +++ b/include/pybind11/pybind11.h @@ -1636,10 +1636,12 @@ class generic_type : public object { with_internals([&](internals &internals) { auto tindex = std::type_index(*rec.type); tinfo->direct_conversions = &internals.direct_conversions[tindex]; + auto &local_internals = get_local_internals(); if (rec.module_local) { - get_local_internals().registered_types_cpp[tindex] = tinfo; + local_internals.registered_types_cpp[rec.type] = tinfo; } else { internals.registered_types_cpp[tindex] = tinfo; + local_internals.global_registered_types_cpp_fast[rec.type] = tinfo; } PYBIND11_WARNING_PUSH @@ -2137,10 +2139,16 @@ class class_ : public detail::generic_type { if (has_alias) { with_internals([&](internals &internals) { - auto &instances = record.module_local ? get_local_internals().registered_types_cpp - : internals.registered_types_cpp; - instances[std::type_index(typeid(type_alias))] - = instances[std::type_index(typeid(type))]; + auto &local_internals = get_local_internals(); + if (record.module_local) { + local_internals.registered_types_cpp[&typeid(type_alias)] + = local_internals.registered_types_cpp[&typeid(type)]; + } else { + type_info *const val + = internals.registered_types_cpp[std::type_index(typeid(type))]; + internals.registered_types_cpp[std::type_index(typeid(type_alias))] = val; + local_internals.global_registered_types_cpp_fast[&typeid(type_alias)] = val; + } }); } def("_pybind11_conduit_v1_", cpp_conduit_method); diff --git a/tests/test_embed/external_module.cpp b/tests/test_embed/external_module.cpp index 3465e8b371..09b60ebbcf 100644 --- a/tests/test_embed/external_module.cpp +++ b/tests/test_embed/external_module.cpp @@ -6,11 +6,32 @@ namespace py = pybind11; * modules aren't preserved over a finalize/initialize. */ +namespace { +// Compare unsafe_reset_internals_for_single_interpreter in +// test_subinterpreter.cpp. +void unsafe_reset_local_internals() { + // NOTE: This code is NOT SAFE unless the caller guarantees no other threads are alive + // NOTE: This code is tied to the precise implementation of the internals holder + + // first, unref the thread local internals + py::detail::get_local_internals_pp_manager().unref(); + + // now we unref the static global singleton internals + py::detail::get_local_internals_pp_manager().unref(); + + // finally, we reload the static global singleton + py::detail::get_local_internals(); +} +} // namespace + PYBIND11_MODULE(external_module, m, py::mod_gil_not_used(), py::multiple_interpreters::per_interpreter_gil()) { - + // At least one test ("Single Subinterpreter") wants to reset + // internals. We have separate local internals because we are a + // separate DSO, so ours need to be reset too! + unsafe_reset_local_internals(); class A { public: explicit A(int value) : v{value} {}; From 1cd2b55e57b88e0fad4ff487f89617b10498d432 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 17 Sep 2025 11:54:10 -0700 Subject: [PATCH 2/7] simplify unsafe_reset_local_internals in test --- tests/test_embed/external_module.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tests/test_embed/external_module.cpp b/tests/test_embed/external_module.cpp index 09b60ebbcf..933ee3a6ff 100644 --- a/tests/test_embed/external_module.cpp +++ b/tests/test_embed/external_module.cpp @@ -13,13 +13,7 @@ void unsafe_reset_local_internals() { // NOTE: This code is NOT SAFE unless the caller guarantees no other threads are alive // NOTE: This code is tied to the precise implementation of the internals holder - // first, unref the thread local internals py::detail::get_local_internals_pp_manager().unref(); - - // now we unref the static global singleton internals - py::detail::get_local_internals_pp_manager().unref(); - - // finally, we reload the static global singleton py::detail::get_local_internals(); } } // namespace From abf237f9c754561f236df978f2c582b14d473240 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 17 Sep 2025 15:22:52 -0700 Subject: [PATCH 3/7] pre-implement PYBIND11_INTERNALS_VERSION 12 --- include/pybind11/detail/class.h | 4 ++- include/pybind11/detail/internals.h | 17 ++++++----- include/pybind11/detail/type_caster_base.h | 34 ++++++++++------------ include/pybind11/pybind11.h | 8 +++-- 4 files changed, 34 insertions(+), 29 deletions(-) diff --git a/include/pybind11/detail/class.h b/include/pybind11/detail/class.h index 848141c014..57af5d0633 100644 --- a/include/pybind11/detail/class.h +++ b/include/pybind11/detail/class.h @@ -226,7 +226,9 @@ extern "C" inline void pybind11_meta_dealloc(PyObject *obj) { local_internals.registered_types_cpp.erase(tinfo->cpptype); } else { internals.registered_types_cpp.erase(tindex); - local_internals.global_registered_types_cpp_fast.erase(tinfo->cpptype); +#if PYBIND11_INTERNALS_VERSION >= 12 + internals.registered_types_cpp_fast.erase(tinfo->cpptype); +#endif } internals.registered_types_py.erase(tinfo->type); diff --git a/include/pybind11/detail/internals.h b/include/pybind11/detail/internals.h index 1d8f8fd433..ea37e78e04 100644 --- a/include/pybind11/detail/internals.h +++ b/include/pybind11/detail/internals.h @@ -39,9 +39,6 @@ /// further ABI-incompatible changes may be made before the ABI is officially /// changed to the new version. #ifndef PYBIND11_INTERNALS_VERSION -// REMINDER for next version bump: remove loader_life_support_tls, -// move local_internals::global_registered_types_cpp_fast to -// internals::registered_types_cpp_fast # define PYBIND11_INTERNALS_VERSION 11 #endif @@ -245,6 +242,14 @@ struct internals { pymutex mutex; pymutex exception_translator_mutex; #endif +#if PYBIND11_INTERNALS_VERSION >= 12 + // non-normative but fast "hint" for + // registered_types_cpp. Successful lookups are correct; + // unsuccessful lookups need to try registered_types_cpp and then + // backfill this map if they find anything. + fast_type_map registered_types_cpp_fast; +#endif + // std::type_index -> pybind11's type information type_map registered_types_cpp; // PyTypeObject* -> base type_info(s) @@ -269,7 +274,9 @@ struct internals { PyObject *instance_base = nullptr; // Unused if PYBIND11_SIMPLE_GIL_MANAGEMENT is defined: thread_specific_storage tstate; +#if PYBIND11_INTERNALS_VERSION <= 11 thread_specific_storage loader_life_support_tls; // OBSOLETE (PR #5830) +#endif // Unused if PYBIND11_SIMPLE_GIL_MANAGEMENT is defined: PyInterpreterState *istate = nullptr; @@ -315,10 +322,6 @@ struct local_internals { // DSO and single instance of type_info for any particular type. fast_type_map registered_types_cpp; - // fast hint for the *global* internals registered_types_cpp - // map. If we lookup successfully, that's the right answer; - // otherwise we go to the global map and then backfill this one. - fast_type_map global_registered_types_cpp_fast; std::forward_list registered_exception_translators; PyTypeObject *function_record_py_type = nullptr; }; diff --git a/include/pybind11/detail/type_caster_base.h b/include/pybind11/detail/type_caster_base.h index 0e8565867a..ff2a1312a7 100644 --- a/include/pybind11/detail/type_caster_base.h +++ b/include/pybind11/detail/type_caster_base.h @@ -205,9 +205,8 @@ PYBIND11_NOINLINE detail::type_info *get_type_info(PyTypeObject *type) { return bases.front(); } -inline detail::type_info *get_local_type_info(const std::type_info &tp, - const local_internals &local_internals) { - const auto &locals = local_internals.registered_types_cpp; +inline detail::type_info *get_local_type_info(const std::type_info &tp) { + const auto &locals = get_local_internals().registered_types_cpp; auto it = locals.find(&tp); if (it != locals.end()) { return it->second; @@ -215,47 +214,44 @@ inline detail::type_info *get_local_type_info(const std::type_info &tp, return nullptr; } -inline detail::type_info *get_local_type_info(const std::type_info &tp) { - return get_local_type_info(tp, get_local_internals()); -} - -inline detail::type_info *get_global_type_info(const std::type_info &tp, - local_internals &local_internals) { +inline detail::type_info *get_global_type_info(const std::type_info &tp) { return with_internals([&](internals &internals) { detail::type_info *type_info = nullptr; - auto &fast_types = local_internals.global_registered_types_cpp_fast; +#if PYBIND11_INTERNALS_VERSION >= 12 + auto &fast_types = internals.registered_types_cpp_fast; +#endif auto &types = internals.registered_types_cpp; +#if PYBIND11_INTERNALS_VERSION >= 12 auto fast_it = fast_types.find(&tp); if (fast_it != fast_types.end()) { -#ifndef NDEBUG +# ifndef NDEBUG auto types_it = types.find(std::type_index(tp)); assert(types_it != types.end()); assert(types_it->second == fast_it->second); -#endif +# endif return fast_it->second; } +#endif // PYBIND11_INTERNALS_VERSION >= 12 + auto it = types.find(std::type_index(tp)); if (it != types.end()) { +#if PYBIND11_INTERNALS_VERSION >= 12 fast_types.emplace(&tp, it->second); +#endif type_info = it->second; } return type_info; }); } -inline detail::type_info *get_global_type_info(const std::type_info &tp) { - return get_global_type_info(tp, get_local_internals()); -} - /// Return the type info for a given C++ type; on lookup failure can either throw or return /// nullptr. PYBIND11_NOINLINE detail::type_info *get_type_info(const std::type_info &tp, bool throw_if_missing = false) { - auto &local_internals = get_local_internals(); - if (auto *ltype = get_local_type_info(tp, local_internals)) { + if (auto *ltype = get_local_type_info(tp)) { return ltype; } - if (auto *gtype = get_global_type_info(tp, local_internals)) { + if (auto *gtype = get_global_type_info(tp)) { return gtype; } diff --git a/include/pybind11/pybind11.h b/include/pybind11/pybind11.h index 058907969d..dfdcd94662 100644 --- a/include/pybind11/pybind11.h +++ b/include/pybind11/pybind11.h @@ -1641,7 +1641,9 @@ class generic_type : public object { local_internals.registered_types_cpp[rec.type] = tinfo; } else { internals.registered_types_cpp[tindex] = tinfo; - local_internals.global_registered_types_cpp_fast[rec.type] = tinfo; +#if PYBIND11_INTERNALS_VERSION >= 12 + internals.registered_types_cpp_fast[rec.type] = tinfo; +#endif } PYBIND11_WARNING_PUSH @@ -2147,7 +2149,9 @@ class class_ : public detail::generic_type { type_info *const val = internals.registered_types_cpp[std::type_index(typeid(type))]; internals.registered_types_cpp[std::type_index(typeid(type_alias))] = val; - local_internals.global_registered_types_cpp_fast[&typeid(type_alias)] = val; +#if PYBIND11_INTERNALS_VERSION >= 12 + internals.registered_types_cpp_fast[&typeid(type_alias)] = val; +#endif } }); } From c333e856fc1d18fa89762d2bc7a81aeda9305e3f Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Thu, 25 Sep 2025 21:19:53 -0700 Subject: [PATCH 4/7] use PYBIND11_INTERNALS_VERSION 12 on Python 3.14 per suggestion --- include/pybind11/detail/internals.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/include/pybind11/detail/internals.h b/include/pybind11/detail/internals.h index ea37e78e04..9d182a55b3 100644 --- a/include/pybind11/detail/internals.h +++ b/include/pybind11/detail/internals.h @@ -39,7 +39,13 @@ /// further ABI-incompatible changes may be made before the ABI is officially /// changed to the new version. #ifndef PYBIND11_INTERNALS_VERSION -# define PYBIND11_INTERNALS_VERSION 11 +# if PY_VERSION_HEX >= 0x030E0000 +// Get test coverage for ABI version 12 without breaking existing +// Python versions. +# define PYBIND11_INTERNALS_VERSION 12 +# else +# define PYBIND11_INTERNALS_VERSION 11 +# endif #endif #if PYBIND11_INTERNALS_VERSION < 11 From 3ca29e8a96f57c42cab2e6f12572df510518693d Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 30 Sep 2025 14:24:51 -0700 Subject: [PATCH 5/7] Implement reviewer comments: revert PY_VERSION_HEX change, fix REVIEW comment, add two-level lookup comments. ci.yml coming separately --- include/pybind11/detail/internals.h | 23 +++++++++------------- include/pybind11/detail/type_caster_base.h | 4 ++++ 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/include/pybind11/detail/internals.h b/include/pybind11/detail/internals.h index 9d182a55b3..ead330d286 100644 --- a/include/pybind11/detail/internals.h +++ b/include/pybind11/detail/internals.h @@ -39,13 +39,7 @@ /// further ABI-incompatible changes may be made before the ABI is officially /// changed to the new version. #ifndef PYBIND11_INTERNALS_VERSION -# if PY_VERSION_HEX >= 0x030E0000 -// Get test coverage for ABI version 12 without breaking existing -// Python versions. -# define PYBIND11_INTERNALS_VERSION 12 -# else -# define PYBIND11_INTERNALS_VERSION 11 -# endif +# define PYBIND11_INTERNALS_VERSION 11 #endif #if PYBIND11_INTERNALS_VERSION < 11 @@ -182,10 +176,10 @@ struct type_equal_to { }; #endif +// For now, we don't bother adding a fancy hash for pointers and just +// let the standard library use the identity hash function if that's +// what it wants to do (e.g., as in libstdc++). template -// REVIEW: do we need to add a fancy hash for pointers or is the -// possible identity hash function from the standard library (e.g., -// libstdc++) sufficient? using fast_type_map = std::unordered_map; template @@ -249,10 +243,11 @@ struct internals { pymutex exception_translator_mutex; #endif #if PYBIND11_INTERNALS_VERSION >= 12 - // non-normative but fast "hint" for - // registered_types_cpp. Successful lookups are correct; - // unsuccessful lookups need to try registered_types_cpp and then - // backfill this map if they find anything. + // non-normative but fast "hint" for registered_types_cpp. Meant + // to be used as the first level of a two-level lookup: successful + // lookups are correct, but unsuccessful lookups need to try + // registered_types_cpp and then backfill this map if they find + // anything. fast_type_map registered_types_cpp_fast; #endif diff --git a/include/pybind11/detail/type_caster_base.h b/include/pybind11/detail/type_caster_base.h index ff2a1312a7..de324b6c3c 100644 --- a/include/pybind11/detail/type_caster_base.h +++ b/include/pybind11/detail/type_caster_base.h @@ -215,6 +215,10 @@ inline detail::type_info *get_local_type_info(const std::type_info &tp) { } inline detail::type_info *get_global_type_info(const std::type_info &tp) { + // This is a two-level lookup. Hopefully we find the type info in + // registered_types_cpp_fast, but if not we try + // registered_types_cpp and fill registered_types_cpp_fast for + // next time. return with_internals([&](internals &internals) { detail::type_info *type_info = nullptr; #if PYBIND11_INTERNALS_VERSION >= 12 From 48f831bb4a0f8daa18db3fdff665cdf19016c08c Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 30 Sep 2025 14:30:28 -0700 Subject: [PATCH 6/7] Use the inplace build to smoke test ABI bump? --- .github/workflows/ci.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 460cd82e66..ca8373b322 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -203,7 +203,8 @@ jobs: if: runner.os != 'Windows' run: echo "CMAKE_GENERATOR=Ninja" >> "$GITHUB_ENV" - # More-or-less randomly adding a few extra flags here + # More-or-less randomly adding a few extra flags here. + # In particular, use this one to smoke test the next ABI bump. - name: Configure run: > cmake -S. -B. @@ -213,6 +214,7 @@ jobs: -DDOWNLOAD_CATCH=ON -DDOWNLOAD_EIGEN=ON -DCMAKE_CXX_STANDARD=14 + -DPYBIND11_INTERNALS_VERSION=10000000 # Checks to makes sure defining `_` is allowed # Triggers EHsc missing error on Windows From 08c09ae4c526c866c4574ef3b05f60d15bd668a4 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Sun, 5 Oct 2025 11:01:25 -0700 Subject: [PATCH 7/7] [skip ci] Remove "smoke" from comment. This is full testing, just only on a few platforms. --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ca8373b322..15f031ad6c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -204,7 +204,7 @@ jobs: run: echo "CMAKE_GENERATOR=Ninja" >> "$GITHUB_ENV" # More-or-less randomly adding a few extra flags here. - # In particular, use this one to smoke test the next ABI bump. + # In particular, use this one to test the next ABI bump (internals version). - name: Configure run: > cmake -S. -B.