From a9126485982724a7f807bfe09a6c09780cc47cf2 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Fri, 5 Sep 2025 09:51:41 -0700 Subject: [PATCH 1/6] Use thread_local for loader_life_support to improve performance As explained in a new code comment, `loader_life_support` needs to be `thread_local` but does not need to be isolated to a particular interpreter because any given function call is already going to only happen on a single interpreter by definiton. Performance before: - on M4 Max using pybind/pybind11_benchmark unmodified repo: ``` > python -m timeit --setup 'from pybind11_benchmark import collatz' 'collatz(4)' 5000000 loops, best of 5: 63.8 nsec per loop ``` - Linux server: ``` python -m timeit --setup 'from pybind11_benchmark import collatz' 'collatz(4)' (pytorch) 2000000 loops, best of 5: 120 nsec per loop ``` After: - M4 Max: ``` python -m timeit --setup 'from pybind11_benchmark import collatz' 'collatz(4)' 5000000 loops, best of 5: 53.1 nsec per loop ``` - Linux server: ``` > python -m timeit --setup 'from pybind11_benchmark import collatz' 'collatz(4)' (pytorch) 2000000 loops, best of 5: 101 nsec per loop ``` A quick profile with perf shows that pthread_setspecific and pthread_getspecific are gone. Open questions: - How do we determine whether we can safely use `thread_local`? I see concerns about old iOS versions on https://github.com/pybind/pybind11/pull/5705#issuecomment-2922858880 and https://github.com/pybind/pybind11/pull/5709; is there anything else? - Do we have a test that covers "function called in one interpreter calls a C++ function that causes a function call in another interpreter"? I think it's fine, but can it happen? - Are we happy with what we think will happen in the case where multiple extensions compiled with and without this PR interoperate? I think it's fine -- each dispatch pushes and cleans up its own state -- but a second opinion is certainly welcome. --- include/pybind11/detail/common.h | 3 ++ include/pybind11/detail/type_caster_base.h | 37 ++++++++++++++++++++-- 2 files changed, 37 insertions(+), 3 deletions(-) diff --git a/include/pybind11/detail/common.h b/include/pybind11/detail/common.h index 22dfc62b4c..3d2079f097 100644 --- a/include/pybind11/detail/common.h +++ b/include/pybind11/detail/common.h @@ -1344,5 +1344,8 @@ constexpr # define PYBIND11_BACKWARD_COMPATIBILITY_TP_DICTOFFSET #endif +// TODO: determine which platforms cannot use thread_local. +#define PYBIND11_CAN_USE_THREAD_LOCAL 1 + PYBIND11_NAMESPACE_END(detail) PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE) diff --git a/include/pybind11/detail/type_caster_base.h b/include/pybind11/detail/type_caster_base.h index 1b23c5c681..76b8b23800 100644 --- a/include/pybind11/detail/type_caster_base.h +++ b/include/pybind11/detail/type_caster_base.h @@ -45,17 +45,48 @@ class loader_life_support { loader_life_support *parent = nullptr; std::unordered_set keep_alive; +#if defined(PYBIND11_CAN_USE_THREAD_LOCAL) + struct fake_thread_specific_storage { + loader_life_support *instance = nullptr; + loader_life_support *get() const { return instance; } + + fake_thread_specific_storage &operator=(loader_life_support *pval) { + instance = pval; + return *this; + } + }; + using loader_storage = fake_thread_specific_storage; +#else + using loader_storage = thread_specific_storage; +#endif + + static loader_storage &get_stack_top() { +#if defined(PYBIND11_CAN_USE_THREAD_LOCAL) + // Without this branch, loader_life_support destruction is a + // significant cost per function call. + // + // Observation: loader_life_support needs to be thread-local, but + // we don't need to go to extra effort to keep it per-interpreter + // (i.e., by putting it in internals) since function calls are + // already isolated to a single interpreter. + static thread_local fake_thread_specific_storage storage; + return storage; +#else + return get_internals().loader_life_support_tls; +#endif + } + public: /// A new patient frame is created when a function is entered loader_life_support() { - auto &stack_top = get_internals().loader_life_support_tls; + auto &stack_top = get_stack_top(); parent = stack_top.get(); stack_top = this; } /// ... and destroyed after it returns ~loader_life_support() { - auto &stack_top = get_internals().loader_life_support_tls; + auto &stack_top = get_stack_top(); if (stack_top.get() != this) { pybind11_fail("loader_life_support: internal error"); } @@ -68,7 +99,7 @@ class loader_life_support { /// This can only be used inside a pybind11-bound function, either by `argument_loader` /// at argument preparation time or by `py::cast()` at execution time. PYBIND11_NOINLINE static void add_patient(handle h) { - loader_life_support *frame = get_internals().loader_life_support_tls.get(); + loader_life_support *frame = get_stack_top().get(); if (!frame) { // NOTE: It would be nice to include the stack frames here, as this indicates // use of pybind11::cast<> outside the normal call framework, finding such From 728a8d9cb26e66e5d55812602b27c63ead348410 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Mon, 8 Sep 2025 09:28:30 -0700 Subject: [PATCH 2/6] Remove PYBIND11_CAN_USE_THREAD_LOCAL --- include/pybind11/detail/common.h | 3 --- include/pybind11/detail/type_caster_base.h | 13 ++----------- 2 files changed, 2 insertions(+), 14 deletions(-) diff --git a/include/pybind11/detail/common.h b/include/pybind11/detail/common.h index 3d2079f097..22dfc62b4c 100644 --- a/include/pybind11/detail/common.h +++ b/include/pybind11/detail/common.h @@ -1344,8 +1344,5 @@ constexpr # define PYBIND11_BACKWARD_COMPATIBILITY_TP_DICTOFFSET #endif -// TODO: determine which platforms cannot use thread_local. -#define PYBIND11_CAN_USE_THREAD_LOCAL 1 - PYBIND11_NAMESPACE_END(detail) PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE) diff --git a/include/pybind11/detail/type_caster_base.h b/include/pybind11/detail/type_caster_base.h index 76b8b23800..11daa11ba3 100644 --- a/include/pybind11/detail/type_caster_base.h +++ b/include/pybind11/detail/type_caster_base.h @@ -45,7 +45,6 @@ class loader_life_support { loader_life_support *parent = nullptr; std::unordered_set keep_alive; -#if defined(PYBIND11_CAN_USE_THREAD_LOCAL) struct fake_thread_specific_storage { loader_life_support *instance = nullptr; loader_life_support *get() const { return instance; } @@ -56,24 +55,16 @@ class loader_life_support { } }; using loader_storage = fake_thread_specific_storage; -#else - using loader_storage = thread_specific_storage; -#endif static loader_storage &get_stack_top() { -#if defined(PYBIND11_CAN_USE_THREAD_LOCAL) - // Without this branch, loader_life_support destruction is a - // significant cost per function call. - // // Observation: loader_life_support needs to be thread-local, but // we don't need to go to extra effort to keep it per-interpreter // (i.e., by putting it in internals) since function calls are // already isolated to a single interpreter. + // This saves a significant cost per function call spent in + // loader_life_support destruction. static thread_local fake_thread_specific_storage storage; return storage; -#else - return get_internals().loader_life_support_tls; -#endif } public: From efc678f66cbc0f6acd950149478a3cd3d61a370e Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Mon, 8 Sep 2025 09:30:36 -0700 Subject: [PATCH 3/6] clarify comment --- include/pybind11/detail/type_caster_base.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/include/pybind11/detail/type_caster_base.h b/include/pybind11/detail/type_caster_base.h index 11daa11ba3..33cd6c2589 100644 --- a/include/pybind11/detail/type_caster_base.h +++ b/include/pybind11/detail/type_caster_base.h @@ -57,11 +57,13 @@ class loader_life_support { using loader_storage = fake_thread_specific_storage; static loader_storage &get_stack_top() { - // Observation: loader_life_support needs to be thread-local, but - // we don't need to go to extra effort to keep it per-interpreter - // (i.e., by putting it in internals) since function calls are - // already isolated to a single interpreter. - // This saves a significant cost per function call spent in + // Observation: loader_life_support needs to be thread-local, + // but we don't need to go to extra effort to keep it + // per-interpreter (i.e., by putting it in internals) since + // individual function calls are already isolated to a single + // interpreter, even though they could potentially call into a + // different interpreter later in the same call chain. This + // saves a significant cost per function call spent in // loader_life_support destruction. static thread_local fake_thread_specific_storage storage; return storage; From f40c22bf5d5be23e887889ee2f2aebd31118821d Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Fri, 12 Sep 2025 11:05:55 -0700 Subject: [PATCH 4/6] Simplify loader_life_support TLS storage Replace the `fake_thread_specific_storage` struct with a direct thread-local pointer managed via a function-local static: static loader_life_support *& tls_current_frame() This retains the "stack of frames" behavior via the `parent` link. It also reduces indirection and clarifies intent. Note: this form is C++11-compatible; once pybind11 requires C++17, the helper can be simplified to: inline static thread_local loader_life_support *tls_current_frame = nullptr; --- include/pybind11/detail/type_caster_base.h | 52 +++++++++------------- 1 file changed, 21 insertions(+), 31 deletions(-) diff --git a/include/pybind11/detail/type_caster_base.h b/include/pybind11/detail/type_caster_base.h index 33cd6c2589..335fe25020 100644 --- a/include/pybind11/detail/type_caster_base.h +++ b/include/pybind11/detail/type_caster_base.h @@ -42,48 +42,38 @@ PYBIND11_NAMESPACE_BEGIN(detail) /// Adding a patient will keep it alive up until the enclosing function returns. class loader_life_support { private: + // Thread-local top-of-stack for loader_life_support frames (linked via parent). + // Observation: loader_life_support needs to be thread-local, + // but we don't need to go to extra effort to keep it + // per-interpreter (i.e., by putting it in internals) since + // individual function calls are already isolated to a single + // interpreter, even though they could potentially call into a + // different interpreter later in the same call chain. This + // saves a significant cost per function call spent in + // loader_life_support destruction. + // Note for future C++17 simplification: + // inline static thread_local loader_life_support *tls_current_frame = nullptr; + static loader_life_support *&tls_current_frame() { + static thread_local loader_life_support *frame_ptr = nullptr; + return frame_ptr; + } + loader_life_support *parent = nullptr; std::unordered_set keep_alive; - struct fake_thread_specific_storage { - loader_life_support *instance = nullptr; - loader_life_support *get() const { return instance; } - - fake_thread_specific_storage &operator=(loader_life_support *pval) { - instance = pval; - return *this; - } - }; - using loader_storage = fake_thread_specific_storage; - - static loader_storage &get_stack_top() { - // Observation: loader_life_support needs to be thread-local, - // but we don't need to go to extra effort to keep it - // per-interpreter (i.e., by putting it in internals) since - // individual function calls are already isolated to a single - // interpreter, even though they could potentially call into a - // different interpreter later in the same call chain. This - // saves a significant cost per function call spent in - // loader_life_support destruction. - static thread_local fake_thread_specific_storage storage; - return storage; - } - public: /// A new patient frame is created when a function is entered loader_life_support() { - auto &stack_top = get_stack_top(); - parent = stack_top.get(); - stack_top = this; + parent = tls_current_frame(); + tls_current_frame() = this; } /// ... and destroyed after it returns ~loader_life_support() { - auto &stack_top = get_stack_top(); - if (stack_top.get() != this) { + if (tls_current_frame() != this) { pybind11_fail("loader_life_support: internal error"); } - stack_top = parent; + tls_current_frame() = parent; for (auto *item : keep_alive) { Py_DECREF(item); } @@ -92,7 +82,7 @@ class loader_life_support { /// This can only be used inside a pybind11-bound function, either by `argument_loader` /// at argument preparation time or by `py::cast()` at execution time. PYBIND11_NOINLINE static void add_patient(handle h) { - loader_life_support *frame = get_stack_top().get(); + loader_life_support *frame = tls_current_frame(); if (!frame) { // NOTE: It would be nice to include the stack frames here, as this indicates // use of pybind11::cast<> outside the normal call framework, finding such From 6bdc29f9bfeff8c7a29e64a14b7e5fb45c723bba Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Fri, 12 Sep 2025 11:30:37 -0700 Subject: [PATCH 5/6] loader_life_support: avoid duplicate tls_current_frame() calls Replace repeated calls with a single local reference: auto &frame = tls_current_frame(); This ensures the thread_local initialization guard is checked only once per constructor/destructor call site, avoids potential clang-tidy complaints, and makes the code more readable. Functional behavior is unchanged. --- include/pybind11/detail/type_caster_base.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/include/pybind11/detail/type_caster_base.h b/include/pybind11/detail/type_caster_base.h index 335fe25020..23d9407350 100644 --- a/include/pybind11/detail/type_caster_base.h +++ b/include/pybind11/detail/type_caster_base.h @@ -64,16 +64,18 @@ class loader_life_support { public: /// A new patient frame is created when a function is entered loader_life_support() { - parent = tls_current_frame(); - tls_current_frame() = this; + auto &frame = tls_current_frame(); + parent = frame; + frame = this; } /// ... and destroyed after it returns ~loader_life_support() { - if (tls_current_frame() != this) { + auto &frame = tls_current_frame(); + if (frame != this) { pybind11_fail("loader_life_support: internal error"); } - tls_current_frame() = parent; + frame = parent; for (auto *item : keep_alive) { Py_DECREF(item); } From fb13fdde4c976523f83be2988e426ae4b0480906 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Fri, 12 Sep 2025 11:44:21 -0700 Subject: [PATCH 6/6] Add REMINDER for next version bump in internals.h --- include/pybind11/detail/internals.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/pybind11/detail/internals.h b/include/pybind11/detail/internals.h index 4ea374861e..cd3afdfe36 100644 --- a/include/pybind11/detail/internals.h +++ b/include/pybind11/detail/internals.h @@ -39,6 +39,7 @@ /// further ABI-incompatible changes may be made before the ABI is officially /// changed to the new version. #ifndef PYBIND11_INTERNALS_VERSION +// REMINDER for next version bump: remove loader_life_support_tls # define PYBIND11_INTERNALS_VERSION 11 #endif @@ -260,7 +261,7 @@ struct internals { PyObject *instance_base = nullptr; // Unused if PYBIND11_SIMPLE_GIL_MANAGEMENT is defined: thread_specific_storage tstate; - thread_specific_storage loader_life_support_tls; + thread_specific_storage loader_life_support_tls; // OBSOLETE (PR #5830) // Unused if PYBIND11_SIMPLE_GIL_MANAGEMENT is defined: PyInterpreterState *istate = nullptr;