From 3276da0b1021b22d209b8c19bb37ce1ef410b01d Mon Sep 17 00:00:00 2001 From: "d.grigonis" Date: Sun, 5 Oct 2025 03:07:03 +0300 Subject: [PATCH 01/14] no adaptivity, but some parts are in and all good-ish --- Objects/listobject.c | 100 +++++++++++++++++++++++++++++-------------- 1 file changed, 68 insertions(+), 32 deletions(-) diff --git a/Objects/listobject.c b/Objects/listobject.c index 5905a6d335b311..bc2bde64ef8b96 100644 --- a/Objects/listobject.c +++ b/Objects/listobject.c @@ -1828,47 +1828,20 @@ binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok) v[M + 1] = vpivot; } #else // binary insertion sort - Py_ssize_t L, R; - for (; ok < n; ++ok) { - /* set L to where a[ok] belongs */ - L = 0; - R = ok; + + // Known: a[ok] < a[ok - 1] + if (ok < n) { + Py_ssize_t L = 0; + Py_ssize_t R = ok - 1; pivot = a[ok]; - /* Slice invariants. vacuously true at the start: - * all a[0:L] <= pivot - * all a[L:R] unknown - * all a[R:ok] > pivot - */ - assert(L < R); do { - /* don't do silly ;-) things to prevent overflow when finding - the midpoint; L and R are very far from filling a Py_ssize_t */ M = (L + R) >> 1; -#if 1 // straightforward, but highly unpredictable branch on random data IFLT(pivot, a[M]) R = M; else L = M + 1; -#else - /* Try to get compiler to generate conditional move instructions - instead. Works fine, but leaving it disabled for now because - it's not yielding consistently faster sorts. Needs more - investigation. More computation in the inner loop adds its own - costs, which can be significant when compares are fast. */ - k = ISLT(pivot, a[M]); - if (k < 0) - goto fail; - Py_ssize_t Mp1 = M + 1; - R = k ? M : R; - L = k ? L : Mp1; -#endif } while (L < R); assert(L == R); - /* a[:L] holds all elements from a[:ok] <= pivot now, so pivot belongs - at index L. Slide a[L:ok] to the right a slot to make room for it. - Caution: using memmove is much slower under MSVC 5; we're not - usually moving many slots. Years later: under Visual Studio 2022, - memmove seems just slightly slower than doing it "by hand". */ for (M = ok; M > L; --M) a[M] = a[M - 1]; a[L] = pivot; @@ -1878,6 +1851,69 @@ binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok) v[M] = v[M - 1]; v[L] = pivot; } + + // Update Adaptive runvars + Py_ssize_t std = ok >> 1; + Py_ssize_t mu = L; + ++ok; + + for (; ok < n; ++ok) { + /* set L to where a[ok] belongs */ + L = 0; + R = ok; + pivot = a[ok]; + /* Slice invariants. vacuously true at the start: + * all a[0:L] <= pivot + * all a[L:R] unknown + * all a[R:ok] > pivot + */ + assert(L < R); + M = (L + R) >> 1; + // M = std < (ok >> 2) ? mu : (L + R) >> 1; + do { + /* don't do silly ;-) things to prevent overflow when finding + the midpoint; L and R are very far from filling a Py_ssize_t */ + +#if 1 // straightforward, but highly unpredictable branch on random data + IFLT(pivot, a[M]) + R = M; + else + L = M + 1; + M = (L + R) >> 1; +#else + /* Try to get compiler to generate conditional move instructions + instead. Works fine, but leaving it disabled for now because + it's not yielding consistently faster sorts. Needs more + investigation. More computation in the inner loop adds its own + costs, which can be significant when compares are fast. */ + k = ISLT(pivot, a[M]); + if (k < 0) + goto fail; + Py_ssize_t Mp1 = M + 1; + R = k ? M : R; + L = k ? L : Mp1; +#endif + } while (L < R); + assert(L == R); + /* a[:L] holds all elements from a[:ok] <= pivot now, so pivot belongs + at index L. 
Slide a[L:ok] to the right a slot to make room for it. + Caution: using memmove is much slower under MSVC 5; we're not + usually moving many slots. Years later: under Visual Studio 2022, + memmove seems just slightly slower than doing it "by hand". */ + for (M = ok; M > L; --M) + a[M] = a[M - 1]; + a[L] = pivot; + if (has_values) { + pivot = v[ok]; + for (M = ok; M > L; --M) + v[M] = v[M - 1]; + v[L] = pivot; + } + + // Update Adaptive runvars + std = labs(L - mu); + mu = L; + } } #endif // pick binary or regular insertion sort return 0; From e63a54bd9154b28099ef04d306616bc1429304a3 Mon Sep 17 00:00:00 2001 From: "d.grigonis" Date: Sun, 5 Oct 2025 05:01:08 +0300 Subject: [PATCH 02/14] only 1cmp shave left active --- Objects/listobject.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Objects/listobject.c b/Objects/listobject.c index bc2bde64ef8b96..31a69e293d4fd7 100644 --- a/Objects/listobject.c +++ b/Objects/listobject.c @@ -1834,6 +1834,7 @@ binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok) Py_ssize_t L = 0; Py_ssize_t R = ok - 1; pivot = a[ok]; + assert(L < R); do { M = (L + R) >> 1; IFLT(pivot, a[M]) @@ -1853,8 +1854,8 @@ binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok) } // Update Adaptive runvars - Py_ssize_t std = ok >> 1; - Py_ssize_t mu = L; + // Py_ssize_t std = ok >> 1; + // Py_ssize_t mu = L; ++ok; for (; ok < n; ++ok) { @@ -1868,18 +1869,16 @@ binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok) * all a[R:ok] > pivot */ assert(L < R); - M = (L + R) >> 1; - // M = std < (ok >> 2) ? mu : (L + R) >> 1; do { /* don't do silly ;-) things to prevent overflow when finding the midpoint; L and R are very far from filling a Py_ssize_t */ #if 1 // straightforward, but highly unpredictable branch on random data + M = (L + R) >> 1; IFLT(pivot, a[M]) R = M; else L = M + 1; - M = (L + R) >> 1; #else /* Try to get compiler to generate conditional move instructions instead. Works fine, but leaving it disabled for now because @@ -1911,8 +1910,9 @@ binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok) } // Update Adaptive runvars - std = labs(L - mu); - mu = L; + // std += labs(L - mu); + // std /= 2; + // mu = L; } } #endif // pick binary or regular insertion sort From 9a212e2804054ce3749b7e585fa6aa374c79c338 Mon Sep 17 00:00:00 2001 From: "d.grigonis" Date: Sat, 11 Oct 2025 21:53:07 +0300 Subject: [PATCH 03/14] adaptivity v2 --- Objects/listobject.c | 389 ++++++++++++++++++++++++++++++------------- 1 file changed, 276 insertions(+), 113 deletions(-) diff --git a/Objects/listobject.c b/Objects/listobject.c index 31a69e293d4fd7..c56ab43e391330 100644 --- a/Objects/listobject.c +++ b/Objects/listobject.c @@ -1764,6 +1764,134 @@ struct s_MergeState { Even in case of error, the output slice will be some permutation of the input (nothing is lost or duplicated). */ +// static int +// binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok) +// { +// Py_ssize_t k; /* for IFLT macro expansion */ +// PyObject ** const a = ss->keys; +// PyObject ** const v = ss->values; +// const bool has_values = v != NULL; +// PyObject *pivot; +// Py_ssize_t M; + +// assert(0 <= ok && ok <= n && 1 <= n && n <= MAX_MINRUN); +// /* assert a[:ok] is sorted */ +// if (! ok) +// ++ok; +// /* Regular insertion sort has average- and worst-case O(n**2) cost +// for both # of comparisons and number of bytes moved. 
But its branches +// are highly predictable, and it loves sorted input (n-1 compares and no +// data movement). This is significant in cases like sortperf.py's %sort, +// where an out-of-order element near the start of a run is moved into +// place slowly but then the remaining elements up to length minrun are +// generally at worst one slot away from their correct position (so only +// need 1 or 2 commpares to resolve). If comparisons are very fast (such +// as for a list of Python floats), the simple inner loop leaves it +// very competitive with binary insertion, despite that it does +// significantly more compares overall on random data. + +// Binary insertion sort has worst, average, and best case O(n log n) +// cost for # of comparisons, but worst and average case O(n**2) cost +// for data movement. The more expensive comparisons, the more important +// the comparison advantage. But its branches are less predictable the +// more "randomish" the data, and that's so significant its worst case +// in real life is random input rather than reverse-ordered (which does +// about twice the data movement than random input does). + +// Note that the number of bytes moved doesn't seem to matter. MAX_MINRUN +// of 64 is so small that the key and value pointers all fit in a corner +// of L1 cache, and moving things around in that is very fast. */ +// #if 0 // ordinary insertion sort. +// PyObject * vpivot = NULL; +// for (; ok < n; ++ok) { +// pivot = a[ok]; +// if (has_values) +// vpivot = v[ok]; +// for (M = ok - 1; M >= 0; --M) { +// k = ISLT(pivot, a[M]); +// if (k < 0) { +// a[M + 1] = pivot; +// if (has_values) +// v[M + 1] = vpivot; +// goto fail; +// } +// else if (k) { +// a[M + 1] = a[M]; +// if (has_values) +// v[M + 1] = v[M]; +// } +// else +// break; +// } +// a[M + 1] = pivot; +// if (has_values) +// v[M + 1] = vpivot; +// } +// #else // binary insertion sort +// Py_ssize_t L, R; +// for (; ok < n; ++ok) { +// /* set L to where a[ok] belongs */ +// L = 0; +// R = ok; +// pivot = a[ok]; +// /* Slice invariants. vacuously true at the start: +// * all a[0:L] <= pivot +// * all a[L:R] unknown +// * all a[R:ok] > pivot +// */ +// assert(L < R); +// do { +// /* don't do silly ;-) things to prevent overflow when finding +// the midpoint; L and R are very far from filling a Py_ssize_t */ + +// #if 1 // straightforward, but highly unpredictable branch on random data +// M = (L + R) >> 1; +// IFLT(pivot, a[M]) +// R = M; +// else +// L = M + 1; +// #else +// /* Try to get compiler to generate conditional move instructions +// instead. Works fine, but leaving it disabled for now because +// it's not yielding consistently faster sorts. Needs more +// investigation. More computation in the inner loop adds its own +// costs, which can be significant when compares are fast. */ +// k = ISLT(pivot, a[M]); +// if (k < 0) +// goto fail; +// Py_ssize_t Mp1 = M + 1; +// R = k ? M : R; +// L = k ? L : Mp1; +// #endif +// } while (L < R); +// assert(L == R); +// /* a[:L] holds all elements from a[:ok] <= pivot now, so pivot belongs +// at index L. Slide a[L:ok] to the right a slot to make room for it. +// Caution: using memmove is much slower under MSVC 5; we're not +// usually moving many slots. Years later: under Visual Studio 2022, +// memmove seems just slightly slower than doing it "by hand". 
*/ +// for (M = ok; M > L; --M) +// a[M] = a[M - 1]; +// a[L] = pivot; +// if (has_values) { +// pivot = v[ok]; +// for (M = ok; M > L; --M) +// v[M] = v[M - 1]; +// v[L] = pivot; +// } + +// // Update Adaptive runvars +// // std += labs(L - mu); +// // std /= 2; +// // mu = L; +// } +// #endif // pick binary or regular insertion sort +// return 0; + +// fail: +// return -1; +// } + static int binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok) { @@ -1776,129 +1904,55 @@ binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok) assert(0 <= ok && ok <= n && 1 <= n && n <= MAX_MINRUN); /* assert a[:ok] is sorted */ - if (! ok) - ++ok; - /* Regular insertion sort has average- and worst-case O(n**2) cost - for both # of comparisons and number of bytes moved. But its branches - are highly predictable, and it loves sorted input (n-1 compares and no - data movement). This is significant in cases like sortperf.py's %sort, - where an out-of-order element near the start of a run is moved into - place slowly but then the remaining elements up to length minrun are - generally at worst one slot away from their correct position (so only - need 1 or 2 commpares to resolve). If comparisons are very fast (such - as for a list of Python floats), the simple inner loop leaves it - very competitive with binary insertion, despite that it does - significantly more compares overall on random data. - - Binary insertion sort has worst, average, and best case O(n log n) - cost for # of comparisons, but worst and average case O(n**2) cost - for data movement. The more expensive comparisons, the more important - the comparison advantage. But its branches are less predictable the - more "randomish" the data, and that's so significant its worst case - in real life is random input rather than reverse-ordered (which does - about twice the data movement than random input does). - - Note that the number of bytes moved doesn't seem to matter. MAX_MINRUN - of 64 is so small that the key and value pointers all fit in a corner - of L1 cache, and moving things around in that is very fast. */ -#if 0 // ordinary insertion sort. 
- PyObject * vpivot = NULL; - for (; ok < n; ++ok) { - pivot = a[ok]; - if (has_values) - vpivot = v[ok]; - for (M = ok - 1; M >= 0; --M) { - k = ISLT(pivot, a[M]); - if (k < 0) { - a[M + 1] = pivot; - if (has_values) - v[M + 1] = vpivot; - goto fail; - } - else if (k) { - a[M + 1] = a[M]; - if (has_values) - v[M + 1] = v[M]; - } - else - break; - } - a[M + 1] = pivot; - if (has_values) - v[M + 1] = vpivot; - } -#else // binary insertion sort + ok += !ok; - // Known: a[ok] < a[ok - 1] - if (ok < n) { - Py_ssize_t L = 0; - Py_ssize_t R = ok - 1; - pivot = a[ok]; - assert(L < R); - do { - M = (L + R) >> 1; - IFLT(pivot, a[M]) - R = M; - else - L = M + 1; - } while (L < R); - assert(L == R); - for (M = ok; M > L; --M) - a[M] = a[M - 1]; - a[L] = pivot; - if (has_values) { - pivot = v[ok]; - for (M = ok; M > L; --M) - v[M] = v[M - 1]; - v[L] = pivot; - } + if (ok == n) + return 0; - // Update Adaptive runvars - // Py_ssize_t std = ok >> 1; - // Py_ssize_t mu = L; - ++ok; + // 1, Known: a[ok] < a[ok - 1] + Py_ssize_t L = 0; + Py_ssize_t R = ok - 1; + pivot = a[ok]; + assert(L < R); + do { + M = (L + R) >> 1; + IFLT(pivot, a[M]) + R = M; + else + L = M + 1; + } while (L < R); + assert(L == R); + for (M = ok; M > L; --M) + a[M] = a[M - 1]; + a[L] = pivot; + if (has_values) { + pivot = v[ok]; + for (M = ok; M > L; --M) + v[M] = v[M - 1]; + v[L] = pivot; + } + ++ok; - for (; ok < n; ++ok) { - /* set L to where a[ok] belongs */ + // 2. Non-adaptive run of 5 + Py_ssize_t mu = L; + Py_ssize_t std = ok >> 1; + Py_ssize_t m = ok + 5; // NOTE: a) Calibrate + if (m < n) { + for (; ok < m; ++ok) { L = 0; R = ok; pivot = a[ok]; - /* Slice invariants. vacuously true at the start: - * all a[0:L] <= pivot - * all a[L:R] unknown - * all a[R:ok] > pivot - */ + assert(L < R); do { - /* don't do silly ;-) things to prevent overflow when finding - the midpoint; L and R are very far from filling a Py_ssize_t */ - -#if 1 // straightforward, but highly unpredictable branch on random data M = (L + R) >> 1; IFLT(pivot, a[M]) R = M; else L = M + 1; -#else - /* Try to get compiler to generate conditional move instructions - instead. Works fine, but leaving it disabled for now because - it's not yielding consistently faster sorts. Needs more - investigation. More computation in the inner loop adds its own - costs, which can be significant when compares are fast. */ - k = ISLT(pivot, a[M]); - if (k < 0) - goto fail; - Py_ssize_t Mp1 = M + 1; - R = k ? M : R; - L = k ? L : Mp1; -#endif } while (L < R); assert(L == R); - /* a[:L] holds all elements from a[:ok] <= pivot now, so pivot belongs - at index L. Slide a[L:ok] to the right a slot to make room for it. - Caution: using memmove is much slower under MSVC 5; we're not - usually moving many slots. Years later: under Visual Studio 2022, - memmove seems just slightly slower than doing it "by hand". */ + for (M = ok; M > L; --M) a[M] = a[M - 1]; a[L] = pivot; @@ -1909,13 +1963,122 @@ binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok) v[L] = pivot; } - // Update Adaptive runvars - // std += labs(L - mu); - // std /= 2; - // mu = L; + std += labs(L - mu); + std /= 2; + mu = L; + } + + // 2. 
Maybe-adaptive run + Py_ssize_t std_max = ok >> 2; // NOTE: b) Calibrate + if (std <= std_max) { + for (; ok < n; ++ok) { + pivot = a[ok]; + mu = L; + + IFLT(pivot, a[mu]) { + L = 0; + R = mu; + if (L < R) { + std += !std; + M = R - std; + if (M < L) + M = L; + IFLT(pivot, a[M]) { + R = M; + if (L < R) { + M = R - std; + if (M < L) + M = L; + IFLT(pivot, a[M]) + R = M; + else + L = M + 1; + } + } + else { + L = M + 1; + } + } + } + else { + L = mu + 1; + R = ok; + if (L < R) { + M = L + std; + if (M >= R) + M = R - 1; + IFLT(pivot, a[M]) { + R = M; + } + else { + L = M + 1; + if (L < R) { + M = L + std; + if (M >= R) + M = R - 1; + IFLT(pivot, a[M]) + R = M; + else + L = M + 1; + } + } + } + } + // Binary Insertion + while (L < R) { + M = (L + R) >> 1; + IFLT(pivot, a[M]) + R = M; + else + L = M + 1; + } + + for (M = ok; M > L; --M) + a[M] = a[M - 1]; + a[L] = pivot; + if (has_values) { + pivot = v[ok]; + for (M = ok; M > L; --M) + v[M] = v[M - 1]; + v[L] = pivot; + } + + std += labs(L - mu); + std /= 2; + std_max += !(ok % 4); + if (std > std_max) { + ++ok; + break; + } + } + } + } + // 3. Finish off with simple binary + for (; ok < n; ++ok) { + L = 0; + R = ok; + pivot = a[ok]; + + assert(L < R); + do { + M = (L + R) >> 1; + IFLT(pivot, a[M]) + R = M; + else + L = M + 1; + } while (L < R); + assert(L == R); + + for (M = ok; M > L; --M) + a[M] = a[M - 1]; + a[L] = pivot; + if (has_values) { + pivot = v[ok]; + for (M = ok; M > L; --M) + v[M] = v[M - 1]; + v[L] = pivot; } } -#endif // pick binary or regular insertion sort return 0; fail: From 74f55287f5ba2af872868d054709096d7a67e5be Mon Sep 17 00:00:00 2001 From: "d.grigonis" Date: Sat, 11 Oct 2025 22:15:53 +0300 Subject: [PATCH 04/14] noop --- Objects/listobject.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Objects/listobject.c b/Objects/listobject.c index c56ab43e391330..99737650effb0e 100644 --- a/Objects/listobject.c +++ b/Objects/listobject.c @@ -1933,11 +1933,11 @@ binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok) } ++ok; - // 2. Non-adaptive run of 5 Py_ssize_t mu = L; Py_ssize_t std = ok >> 1; Py_ssize_t m = ok + 5; // NOTE: a) Calibrate if (m < n) { + // 2. Non-adaptive run of 5 for (; ok < m; ++ok) { L = 0; R = ok; @@ -1968,7 +1968,7 @@ binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok) mu = L; } - // 2. Maybe-adaptive run + // 3. Maybe-adaptive run Py_ssize_t std_max = ok >> 2; // NOTE: b) Calibrate if (std <= std_max) { for (; ok < n; ++ok) { @@ -2053,7 +2053,7 @@ binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok) } } } - // 3. Finish off with simple binary + // 4. Finish off with simple binary for (; ok < n; ++ok) { L = 0; R = ok; From 52217f3dacc4dfdb152f9709afd0412848556a35 Mon Sep 17 00:00:00 2001 From: "d.grigonis" Date: Sun, 12 Oct 2025 12:06:09 +0300 Subject: [PATCH 05/14] stable non-invasive implementation --- Objects/listobject.c | 427 +++++++++++++++++++------------------------ Objects/listsort.txt | 4 + 2 files changed, 192 insertions(+), 239 deletions(-) diff --git a/Objects/listobject.c b/Objects/listobject.c index 99737650effb0e..481b40dea95837 100644 --- a/Objects/listobject.c +++ b/Objects/listobject.c @@ -1764,134 +1764,6 @@ struct s_MergeState { Even in case of error, the output slice will be some permutation of the input (nothing is lost or duplicated). 
*/ -// static int -// binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok) -// { -// Py_ssize_t k; /* for IFLT macro expansion */ -// PyObject ** const a = ss->keys; -// PyObject ** const v = ss->values; -// const bool has_values = v != NULL; -// PyObject *pivot; -// Py_ssize_t M; - -// assert(0 <= ok && ok <= n && 1 <= n && n <= MAX_MINRUN); -// /* assert a[:ok] is sorted */ -// if (! ok) -// ++ok; -// /* Regular insertion sort has average- and worst-case O(n**2) cost -// for both # of comparisons and number of bytes moved. But its branches -// are highly predictable, and it loves sorted input (n-1 compares and no -// data movement). This is significant in cases like sortperf.py's %sort, -// where an out-of-order element near the start of a run is moved into -// place slowly but then the remaining elements up to length minrun are -// generally at worst one slot away from their correct position (so only -// need 1 or 2 commpares to resolve). If comparisons are very fast (such -// as for a list of Python floats), the simple inner loop leaves it -// very competitive with binary insertion, despite that it does -// significantly more compares overall on random data. - -// Binary insertion sort has worst, average, and best case O(n log n) -// cost for # of comparisons, but worst and average case O(n**2) cost -// for data movement. The more expensive comparisons, the more important -// the comparison advantage. But its branches are less predictable the -// more "randomish" the data, and that's so significant its worst case -// in real life is random input rather than reverse-ordered (which does -// about twice the data movement than random input does). - -// Note that the number of bytes moved doesn't seem to matter. MAX_MINRUN -// of 64 is so small that the key and value pointers all fit in a corner -// of L1 cache, and moving things around in that is very fast. */ -// #if 0 // ordinary insertion sort. -// PyObject * vpivot = NULL; -// for (; ok < n; ++ok) { -// pivot = a[ok]; -// if (has_values) -// vpivot = v[ok]; -// for (M = ok - 1; M >= 0; --M) { -// k = ISLT(pivot, a[M]); -// if (k < 0) { -// a[M + 1] = pivot; -// if (has_values) -// v[M + 1] = vpivot; -// goto fail; -// } -// else if (k) { -// a[M + 1] = a[M]; -// if (has_values) -// v[M + 1] = v[M]; -// } -// else -// break; -// } -// a[M + 1] = pivot; -// if (has_values) -// v[M + 1] = vpivot; -// } -// #else // binary insertion sort -// Py_ssize_t L, R; -// for (; ok < n; ++ok) { -// /* set L to where a[ok] belongs */ -// L = 0; -// R = ok; -// pivot = a[ok]; -// /* Slice invariants. vacuously true at the start: -// * all a[0:L] <= pivot -// * all a[L:R] unknown -// * all a[R:ok] > pivot -// */ -// assert(L < R); -// do { -// /* don't do silly ;-) things to prevent overflow when finding -// the midpoint; L and R are very far from filling a Py_ssize_t */ - -// #if 1 // straightforward, but highly unpredictable branch on random data -// M = (L + R) >> 1; -// IFLT(pivot, a[M]) -// R = M; -// else -// L = M + 1; -// #else -// /* Try to get compiler to generate conditional move instructions -// instead. Works fine, but leaving it disabled for now because -// it's not yielding consistently faster sorts. Needs more -// investigation. More computation in the inner loop adds its own -// costs, which can be significant when compares are fast. */ -// k = ISLT(pivot, a[M]); -// if (k < 0) -// goto fail; -// Py_ssize_t Mp1 = M + 1; -// R = k ? M : R; -// L = k ? 
L : Mp1; -// #endif -// } while (L < R); -// assert(L == R); -// /* a[:L] holds all elements from a[:ok] <= pivot now, so pivot belongs -// at index L. Slide a[L:ok] to the right a slot to make room for it. -// Caution: using memmove is much slower under MSVC 5; we're not -// usually moving many slots. Years later: under Visual Studio 2022, -// memmove seems just slightly slower than doing it "by hand". */ -// for (M = ok; M > L; --M) -// a[M] = a[M - 1]; -// a[L] = pivot; -// if (has_values) { -// pivot = v[ok]; -// for (M = ok; M > L; --M) -// v[M] = v[M - 1]; -// v[L] = pivot; -// } - -// // Update Adaptive runvars -// // std += labs(L - mu); -// // std /= 2; -// // mu = L; -// } -// #endif // pick binary or regular insertion sort -// return 0; - -// fail: -// return -1; -// } - static int binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok) { @@ -1904,171 +1776,247 @@ binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok) assert(0 <= ok && ok <= n && 1 <= n && n <= MAX_MINRUN); /* assert a[:ok] is sorted */ - ok += !ok; + if (! ok) + ++ok; - if (ok == n) +#if 1 // Adaptivity with post `count_run` optimization of 1st pivot + // 1. Known: a[ok] < a[ok - 1], as called after `count_run` + if (ok >= n) return 0; - - // 1, Known: a[ok] < a[ok - 1] - Py_ssize_t L = 0; - Py_ssize_t R = ok - 1; + Py_ssize_t aL = 0; + Py_ssize_t aR = ok - 1; pivot = a[ok]; - assert(L < R); + assert(aL < aR); do { - M = (L + R) >> 1; + M = (aL + aR) >> 1; IFLT(pivot, a[M]) - R = M; + aR = M; else - L = M + 1; - } while (L < R); - assert(L == R); - for (M = ok; M > L; --M) + aL = M + 1; + } while (aL < aR); + assert(aL == aR); + for (M = ok; M > aL; --M) a[M] = a[M - 1]; - a[L] = pivot; + a[aL] = pivot; if (has_values) { pivot = v[ok]; - for (M = ok; M > L; --M) + for (M = ok; M > aL; --M) v[M] = v[M - 1]; - v[L] = pivot; + v[aL] = pivot; } ++ok; - Py_ssize_t mu = L; - Py_ssize_t std = ok >> 1; - Py_ssize_t m = ok + 5; // NOTE: a) Calibrate + Py_ssize_t m = ok < 5 ? 11 : ok + 6; if (m < n) { - // 2. Non-adaptive run of 5 + // 2. Small non-adaptive run to acquire good `std` estimate + Py_ssize_t mu = aL; + Py_ssize_t std = ok >> 1; for (; ok < m; ++ok) { - L = 0; - R = ok; + aL = 0; + aR = ok; pivot = a[ok]; - assert(L < R); + assert(aL < aR); do { - M = (L + R) >> 1; + M = (aL + aR) >> 1; IFLT(pivot, a[M]) - R = M; + aR = M; else - L = M + 1; - } while (L < R); - assert(L == R); + aL = M + 1; + } while (aL < aR); + assert(aL == aR); - for (M = ok; M > L; --M) + for (M = ok; M > aL; --M) a[M] = a[M - 1]; - a[L] = pivot; + a[aL] = pivot; if (has_values) { pivot = v[ok]; - for (M = ok; M > L; --M) + for (M = ok; M > aL; --M) v[M] = v[M - 1]; - v[L] = pivot; + v[aL] = pivot; } - std += labs(L - mu); + std += labs(aL - mu); std /= 2; - mu = L; + mu = aL; } - // 3. Maybe-adaptive run - Py_ssize_t std_max = ok >> 2; // NOTE: b) Calibrate - if (std <= std_max) { - for (; ok < n; ++ok) { - pivot = a[ok]; - mu = L; - - IFLT(pivot, a[mu]) { - L = 0; - R = mu; - if (L < R) { - std += !std; - M = R - std; - if (M < L) - M = L; - IFLT(pivot, a[M]) { - R = M; - if (L < R) { - M = R - std; - if (M < L) - M = L; - IFLT(pivot, a[M]) - R = M; - else - L = M + 1; - } - } - else { - L = M + 1; + // 3. 
Adaptive routine while `std` is small enough + Py_ssize_t std_max = ok >> 2; + for (; ok < n && std <= std_max; ++ok) { + pivot = a[ok]; + + IFLT(pivot, a[mu]) { + aL = 0; + aR = mu; + if (aL < aR) { + std += !std; + M = aR - std; + if (M < aL) + M = aL; + IFLT(pivot, a[M]) { + aR = M; + if (aL < aR) { + M = aR - std; + if (M < aL) + M = aL; + IFLT(pivot, a[M]) + aR = M; + else + aL = M + 1; } } + else { + aL = M + 1; + } } - else { - L = mu + 1; - R = ok; - if (L < R) { - M = L + std; - if (M >= R) - M = R - 1; - IFLT(pivot, a[M]) { - R = M; - } - else { - L = M + 1; - if (L < R) { - M = L + std; - if (M >= R) - M = R - 1; - IFLT(pivot, a[M]) - R = M; - else - L = M + 1; - } + } + else { + aL = mu + 1; + aR = ok; + if (aL < aR) { + M = aL + std; + if (M >= aR) + M = aR - 1; + IFLT(pivot, a[M]) { + aR = M; + } + else { + aL = M + 1; + if (aL < aR) { + M = aL + std; + if (M >= aR) + M = aR - 1; + IFLT(pivot, a[M]) + aR = M; + else + aL = M + 1; } } } - // Binary Insertion - while (L < R) { - M = (L + R) >> 1; - IFLT(pivot, a[M]) - R = M; - else - L = M + 1; - } + } + // Binary Insertion + while (aL < aR) { + M = (aL + aR) >> 1; + IFLT(pivot, a[M]) + aR = M; + else + aL = M + 1; + } + assert(aL == aR); - for (M = ok; M > L; --M) - a[M] = a[M - 1]; - a[L] = pivot; - if (has_values) { - pivot = v[ok]; - for (M = ok; M > L; --M) - v[M] = v[M - 1]; - v[L] = pivot; - } + for (M = ok; M > aL; --M) + a[M] = a[M - 1]; + a[aL] = pivot; + if (has_values) { + pivot = v[ok]; + for (M = ok; M > aL; --M) + v[M] = v[M - 1]; + v[aL] = pivot; + } - std += labs(L - mu); - std /= 2; - std_max += !(ok % 4); - if (std > std_max) { - ++ok; - break; - } + std += labs(aL - mu); + std /= 2; + std_max += !(ok % 4); + mu = aL; + } + } + + // 4. Finish off with non-adaptive sort +#endif // End of adaptivity + + + /* Regular insertion sort has average- and worst-case O(n**2) cost + for both # of comparisons and number of bytes moved. But its branches + are highly predictable, and it loves sorted input (n-1 compares and no + data movement). This is significant in cases like sortperf.py's %sort, + where an out-of-order element near the start of a run is moved into + place slowly but then the remaining elements up to length minrun are + generally at worst one slot away from their correct position (so only + need 1 or 2 commpares to resolve). If comparisons are very fast (such + as for a list of Python floats), the simple inner loop leaves it + very competitive with binary insertion, despite that it does + significantly more compares overall on random data. + + Binary insertion sort has worst, average, and best case O(n log n) + cost for # of comparisons, but worst and average case O(n**2) cost + for data movement. The more expensive comparisons, the more important + the comparison advantage. But its branches are less predictable the + more "randomish" the data, and that's so significant its worst case + in real life is random input rather than reverse-ordered (which does + about twice the data movement than random input does). + + Note that the number of bytes moved doesn't seem to matter. MAX_MINRUN + of 64 is so small that the key and value pointers all fit in a corner + of L1 cache, and moving things around in that is very fast. */ +#if 0 // ordinary insertion sort. 
+ PyObject * vpivot = NULL; + for (; ok < n; ++ok) { + pivot = a[ok]; + if (has_values) + vpivot = v[ok]; + for (M = ok - 1; M >= 0; --M) { + k = ISLT(pivot, a[M]); + if (k < 0) { + a[M + 1] = pivot; + if (has_values) + v[M + 1] = vpivot; + goto fail; } + else if (k) { + a[M + 1] = a[M]; + if (has_values) + v[M + 1] = v[M]; + } + else + break; } + a[M + 1] = pivot; + if (has_values) + v[M + 1] = vpivot; } - // 4. Finish off with simple binary +#else // binary insertion sort + Py_ssize_t L, R; for (; ok < n; ++ok) { + /* set L to where a[ok] belongs */ L = 0; R = ok; pivot = a[ok]; - + /* Slice invariants. vacuously true at the start: + * all a[0:L] <= pivot + * all a[L:R] unknown + * all a[R:ok] > pivot + */ assert(L < R); do { + /* don't do silly ;-) things to prevent overflow when finding + the midpoint; L and R are very far from filling a Py_ssize_t */ M = (L + R) >> 1; +#if 1 // straightforward, but highly unpredictable branch on random data IFLT(pivot, a[M]) R = M; else L = M + 1; +#else + /* Try to get compiler to generate conditional move instructions + instead. Works fine, but leaving it disabled for now because + it's not yielding consistently faster sorts. Needs more + investigation. More computation in the inner loop adds its own + costs, which can be significant when compares are fast. */ + k = ISLT(pivot, a[M]); + if (k < 0) + goto fail; + Py_ssize_t Mp1 = M + 1; + R = k ? M : R; + L = k ? L : Mp1; +#endif } while (L < R); assert(L == R); - + /* a[:L] holds all elements from a[:ok] <= pivot now, so pivot belongs + at index L. Slide a[L:ok] to the right a slot to make room for it. + Caution: using memmove is much slower under MSVC 5; we're not + usually moving many slots. Years later: under Visual Studio 2022, + memmove seems just slightly slower than doing it "by hand". */ for (M = ok; M > L; --M) a[M] = a[M - 1]; a[L] = pivot; @@ -2079,6 +2027,7 @@ binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok) v[L] = pivot; } } +#endif // pick binary or regular insertion sort return 0; fail: diff --git a/Objects/listsort.txt b/Objects/listsort.txt index 5b2fc7d50a25ca..27ae6b8946b64d 100644 --- a/Objects/listsort.txt +++ b/Objects/listsort.txt @@ -836,6 +836,10 @@ So cutting the number of compares is almost always measurably helpful in CPython, and the savings swamp the quadratic-time data movement costs for reasonable minrun values. +Additionally, "binary insertion sort" has implemented adaptivity procedure, +which reduces the number of comparisons for cases where data is already +sorted to high degree in either forward or reversed order. 
+ LEFT OR RIGHT gallop_left() and gallop_right() are akin to the Python bisect module's From e2e2ec551b3738f5b92b50f56c6431ff2b76370b Mon Sep 17 00:00:00 2001 From: "d.grigonis" Date: Sun, 12 Oct 2025 12:19:41 +0300 Subject: [PATCH 06/14] noop --- Objects/listobject.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Objects/listobject.c b/Objects/listobject.c index 481b40dea95837..102fc4aab8b155 100644 --- a/Objects/listobject.c +++ b/Objects/listobject.c @@ -1837,7 +1837,7 @@ binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok) } std += labs(aL - mu); - std /= 2; + std /= 2; // EWMA with alpha=0.5 mu = aL; } @@ -1916,7 +1916,7 @@ binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok) } std += labs(aL - mu); - std /= 2; + std /= 2; // EWMA with alpha=0.5 std_max += !(ok % 4); mu = aL; } From 1641d5d95f71ead0ba056a7622a167283adecc99 Mon Sep 17 00:00:00 2001 From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com> Date: Sun, 12 Oct 2025 09:44:42 +0000 Subject: [PATCH 07/14] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20b?= =?UTF-8?q?lurb=5Fit.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../2025-10-12-09-44-38.gh-issue-138946.VFqsnO.rst | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 Misc/NEWS.d/next/Core_and_Builtins/2025-10-12-09-44-38.gh-issue-138946.VFqsnO.rst diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-10-12-09-44-38.gh-issue-138946.VFqsnO.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-10-12-09-44-38.gh-issue-138946.VFqsnO.rst new file mode 100644 index 00000000000000..f300767326bee4 --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2025-10-12-09-44-38.gh-issue-138946.VFqsnO.rst @@ -0,0 +1,2 @@ +Sorting is now faster for cases where data is already +sorted to high degree in either forward or reversed order. From f34aa8763fb2537b671804b2ee34cbfa9b6d9415 Mon Sep 17 00:00:00 2001 From: "d.grigonis" Date: Sun, 12 Oct 2025 20:53:44 +0300 Subject: [PATCH 08/14] macros for repetitions --- Objects/listobject.c | 92 +++++++++++++++++--------------------------- 1 file changed, 35 insertions(+), 57 deletions(-) diff --git a/Objects/listobject.c b/Objects/listobject.c index 102fc4aab8b155..e69349a36e9f51 100644 --- a/Objects/listobject.c +++ b/Objects/listobject.c @@ -1753,6 +1753,31 @@ struct s_MergeState { Py_ssize_t mr_current, mr_e, mr_mask; }; +#define _binarysort_BISECT(L, R) \ + do { \ + do { \ + M = (L + R) >> 1; \ + IFLT(pivot, a[M]) \ + R = M; \ + else \ + L = M + 1; \ + } while (L < R); \ + assert(L == R); \ + } while (0) + +#define _binarysort_INSORT(idx, tmp) \ + do { \ + for (tmp = ok; tmp > idx; --tmp) \ + a[tmp] = a[tmp - 1]; \ + a[idx] = pivot; \ + if (has_values) { \ + pivot = v[ok]; \ + for (tmp = ok; tmp > idx; --tmp) \ + v[tmp] = v[tmp - 1]; \ + v[idx] = pivot; \ + } \ + } while (0) + /* binarysort is the best method for sorting small arrays: it does few compares, but can do data movement quadratic in the number of elements. ss->keys is viewed as an array of n kays, a[:n]. a[:ok] is already sorted. 
@@ -1786,24 +1811,10 @@ binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok) Py_ssize_t aL = 0; Py_ssize_t aR = ok - 1; pivot = a[ok]; + assert(aL < aR); - do { - M = (aL + aR) >> 1; - IFLT(pivot, a[M]) - aR = M; - else - aL = M + 1; - } while (aL < aR); - assert(aL == aR); - for (M = ok; M > aL; --M) - a[M] = a[M - 1]; - a[aL] = pivot; - if (has_values) { - pivot = v[ok]; - for (M = ok; M > aL; --M) - v[M] = v[M - 1]; - v[aL] = pivot; - } + _binarysort_BISECT(aL, aR); + _binarysort_INSORT(aL, M); ++ok; Py_ssize_t m = ok < 5 ? 11 : ok + 6; @@ -1817,24 +1828,8 @@ binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok) pivot = a[ok]; assert(aL < aR); - do { - M = (aL + aR) >> 1; - IFLT(pivot, a[M]) - aR = M; - else - aL = M + 1; - } while (aL < aR); - assert(aL == aR); - - for (M = ok; M > aL; --M) - a[M] = a[M - 1]; - a[aL] = pivot; - if (has_values) { - pivot = v[ok]; - for (M = ok; M > aL; --M) - v[M] = v[M - 1]; - v[aL] = pivot; - } + _binarysort_BISECT(aL, aR); + _binarysort_INSORT(aL, M); std += labs(aL - mu); std /= 2; // EWMA with alpha=0.5 @@ -1850,14 +1845,13 @@ binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok) aL = 0; aR = mu; if (aL < aR) { - std += !std; - M = aR - std; + M = aR - 1 - std; if (M < aL) M = aL; IFLT(pivot, a[M]) { aR = M; if (aL < aR) { - M = aR - std; + M = aR - 1 - std; if (M < aL) M = aL; IFLT(pivot, a[M]) @@ -1904,16 +1898,7 @@ binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok) aL = M + 1; } assert(aL == aR); - - for (M = ok; M > aL; --M) - a[M] = a[M - 1]; - a[aL] = pivot; - if (has_values) { - pivot = v[ok]; - for (M = ok; M > aL; --M) - v[M] = v[M - 1]; - v[aL] = pivot; - } + _binarysort_INSORT(aL, M); std += labs(aL - mu); std /= 2; // EWMA with alpha=0.5 @@ -2017,15 +2002,8 @@ binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok) Caution: using memmove is much slower under MSVC 5; we're not usually moving many slots. Years later: under Visual Studio 2022, memmove seems just slightly slower than doing it "by hand". */ - for (M = ok; M > L; --M) - a[M] = a[M - 1]; - a[L] = pivot; - if (has_values) { - pivot = v[ok]; - for (M = ok; M > L; --M) - v[M] = v[M - 1]; - v[L] = pivot; - } + + _binarysort_INSORT(L, M); } #endif // pick binary or regular insertion sort return 0; From 181fef4e01ad96319e2671ec6288632b00be1999 Mon Sep 17 00:00:00 2001 From: "d.grigonis" Date: Sun, 12 Oct 2025 20:56:06 +0300 Subject: [PATCH 09/14] remove macro from original code --- Objects/listobject.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/Objects/listobject.c b/Objects/listobject.c index e69349a36e9f51..e728f3add6a783 100644 --- a/Objects/listobject.c +++ b/Objects/listobject.c @@ -2002,8 +2002,15 @@ binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok) Caution: using memmove is much slower under MSVC 5; we're not usually moving many slots. Years later: under Visual Studio 2022, memmove seems just slightly slower than doing it "by hand". 
*/ - - _binarysort_INSORT(L, M); + for (M = ok; M > L; --M) + a[M] = a[M - 1]; + a[L] = pivot; + if (has_values) { + pivot = v[ok]; + for (M = ok; M > L; --M) + v[M] = v[M - 1]; + v[L] = pivot; + } } #endif // pick binary or regular insertion sort return 0; From a135ec80db4b6d30358e3d8e3faa54bb48870df5 Mon Sep 17 00:00:00 2001 From: "d.grigonis" Date: Sun, 12 Oct 2025 22:20:58 +0300 Subject: [PATCH 10/14] replace labs with ternary --- Objects/listobject.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Objects/listobject.c b/Objects/listobject.c index e728f3add6a783..b6fa56eaf01228 100644 --- a/Objects/listobject.c +++ b/Objects/listobject.c @@ -1831,7 +1831,7 @@ binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok) _binarysort_BISECT(aL, aR); _binarysort_INSORT(aL, M); - std += labs(aL - mu); + std += mu < aL ? aL - mu : mu - aL; std /= 2; // EWMA with alpha=0.5 mu = aL; } @@ -1900,7 +1900,7 @@ binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok) assert(aL == aR); _binarysort_INSORT(aL, M); - std += labs(aL - mu); + std += mu < aL ? aL - mu : mu - aL; std /= 2; // EWMA with alpha=0.5 std_max += !(ok % 4); mu = aL; From b5a0dc62b7762b36b9ed7075d75acd2c90b65723 Mon Sep 17 00:00:00 2001 From: "d.grigonis" Date: Sun, 12 Oct 2025 22:54:17 +0300 Subject: [PATCH 11/14] noop --- Objects/listobject.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Objects/listobject.c b/Objects/listobject.c index b6fa56eaf01228..ae4d7a1284dae4 100644 --- a/Objects/listobject.c +++ b/Objects/listobject.c @@ -1902,8 +1902,8 @@ binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok) std += mu < aL ? aL - mu : mu - aL; std /= 2; // EWMA with alpha=0.5 - std_max += !(ok % 4); mu = aL; + std_max += !(ok % 4); } } From ef3a870eec828a320ca40d8c42057561112329c4 Mon Sep 17 00:00:00 2001 From: "d.grigonis" Date: Mon, 13 Oct 2025 11:04:05 +0300 Subject: [PATCH 12/14] revert back to labs --- Objects/listobject.c | 54 ++++++++++++++++++++------------------------ 1 file changed, 25 insertions(+), 29 deletions(-) diff --git a/Objects/listobject.c b/Objects/listobject.c index ae4d7a1284dae4..0757659b1012e4 100644 --- a/Objects/listobject.c +++ b/Objects/listobject.c @@ -1755,28 +1755,24 @@ struct s_MergeState { #define _binarysort_BISECT(L, R) \ do { \ - do { \ - M = (L + R) >> 1; \ - IFLT(pivot, a[M]) \ - R = M; \ - else \ - L = M + 1; \ - } while (L < R); \ - assert(L == R); \ - } while (0) - -#define _binarysort_INSORT(idx, tmp) \ - do { \ - for (tmp = ok; tmp > idx; --tmp) \ - a[tmp] = a[tmp - 1]; \ - a[idx] = pivot; \ - if (has_values) { \ - pivot = v[ok]; \ - for (tmp = ok; tmp > idx; --tmp) \ - v[tmp] = v[tmp - 1]; \ - v[idx] = pivot; \ - } \ - } while (0) + M = (L + R) >> 1; \ + IFLT(pivot, a[M]) \ + R = M; \ + else \ + L = M + 1; \ + } while (L < R); \ + assert(L == R); + +#define _binarysort_INSORT(idx, tmp) \ + for (tmp = ok; tmp > idx; --tmp) \ + a[tmp] = a[tmp - 1]; \ + a[idx] = pivot; \ + if (has_values) { \ + pivot = v[ok]; \ + for (tmp = ok; tmp > idx; --tmp) \ + v[tmp] = v[tmp - 1]; \ + v[idx] = pivot; \ + } /* binarysort is the best method for sorting small arrays: it does few compares, but can do data movement quadratic in the number of elements. 
@@ -1813,8 +1809,8 @@ binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok) pivot = a[ok]; assert(aL < aR); - _binarysort_BISECT(aL, aR); - _binarysort_INSORT(aL, M); + _binarysort_BISECT(aL, aR) + _binarysort_INSORT(aL, M) ++ok; Py_ssize_t m = ok < 5 ? 11 : ok + 6; @@ -1828,10 +1824,10 @@ binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok) pivot = a[ok]; assert(aL < aR); - _binarysort_BISECT(aL, aR); - _binarysort_INSORT(aL, M); + _binarysort_BISECT(aL, aR) + _binarysort_INSORT(aL, M) - std += mu < aL ? aL - mu : mu - aL; + std += labs(aL - mu); std /= 2; // EWMA with alpha=0.5 mu = aL; } @@ -1898,9 +1894,9 @@ binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok) aL = M + 1; } assert(aL == aR); - _binarysort_INSORT(aL, M); + _binarysort_INSORT(aL, M) - std += mu < aL ? aL - mu : mu - aL; + std += labs(aL - mu); std /= 2; // EWMA with alpha=0.5 mu = aL; std_max += !(ok % 4); From 3de148ff41c05abaa06800be2cf50138c2b83a54 Mon Sep 17 00:00:00 2001 From: "d.grigonis" Date: Mon, 13 Oct 2025 11:35:28 +0300 Subject: [PATCH 13/14] docs --- Misc/ACKS | 1 + Objects/listobject.c | 27 ++++++++++++++++++++------- Objects/listsort.txt | 6 ++++++ 3 files changed, 27 insertions(+), 7 deletions(-) diff --git a/Misc/ACKS b/Misc/ACKS index 0812b229e0ada4..ef071176297edc 100644 --- a/Misc/ACKS +++ b/Misc/ACKS @@ -685,6 +685,7 @@ Eddy De Greef Duane Griffin Grant Griffin Andrea Griffini +Dominykas Grigonis Semyon Grigoryev Duncan Grisby Olivier Grisel diff --git a/Objects/listobject.c b/Objects/listobject.c index 0757659b1012e4..31908b2ee9e88d 100644 --- a/Objects/listobject.c +++ b/Objects/listobject.c @@ -1801,7 +1801,8 @@ binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok) ++ok; #if 1 // Adaptivity with post `count_run` optimization of 1st pivot - // 1. Known: a[ok] < a[ok - 1], as called after `count_run` + /* 1. Known: a[ok] < a[ok - 1], as called after `count_run` + This just insorts first element taking that into account */ if (ok >= n) return 0; Py_ssize_t aL = 0; @@ -1815,7 +1816,11 @@ binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok) Py_ssize_t m = ok < 5 ? 11 : ok + 6; if (m < n) { - // 2. Small non-adaptive run to acquire good `std` estimate + /* 2. Small non-adaptive run to acquire good `std` estimate + Number of iterations (m) is chosen heuristically + and is subject to further calibration if needed. + It does minimum 6 iterations and up to 10 if pre-sorted part + is small as estimates of small integers are less reliable. */ Py_ssize_t mu = aL; Py_ssize_t std = ok >> 1; for (; ok < m; ++ok) { @@ -1832,7 +1837,15 @@ binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok) mu = aL; } - // 3. Adaptive routine while `std` is small enough + /* 3. Adaptive routine while `std` is small enough + Take the last insertion point as the first midpoint + and do 2 subsequent step of size `std` trying to capture + the range into which new value falls in, potentially + locating insertion point faster than standard `binarysort`. + Continue until `std` (step size) is lower than + (size of sorted part) / 4. + If estimate from (2) is initially not small enough, + this does not execute a single time. 
*/ Py_ssize_t std_max = ok >> 2; for (; ok < n && std <= std_max; ++ok) { pivot = a[ok]; @@ -1885,7 +1898,7 @@ binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok) } } } - // Binary Insertion + // Simple Binary Insertion Sort while (aL < aR) { M = (aL + aR) >> 1; IFLT(pivot, a[M]) @@ -1897,13 +1910,13 @@ binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok) _binarysort_INSORT(aL, M) std += labs(aL - mu); - std /= 2; // EWMA with alpha=0.5 + std /= 2; // EWMA with alpha=0.5 mu = aL; - std_max += !(ok % 4); + std_max += !(ok % 4); // Keep approximately equal to: ok / 4 } } - // 4. Finish off with non-adaptive sort + // 4. Finish with non-adaptive sort #endif // End of adaptivity diff --git a/Objects/listsort.txt b/Objects/listsort.txt index 27ae6b8946b64d..a57ab34554c57a 100644 --- a/Objects/listsort.txt +++ b/Objects/listsort.txt @@ -839,6 +839,12 @@ reasonable minrun values. Additionally, "binary insertion sort" has implemented adaptivity procedure, which reduces the number of comparisons for cases where data is already sorted to high degree in either forward or reversed order. +While "binary insertion sort" ensures optimal number of comparisons +it looses best case of textbook insertion sort when data is highly sorted +in correct order. Adaptivity addition brings that back and more at small cost. +It adapts to any data where position of next element is close to the one of +last element. Thus, it adapts to cases where it is highly sorted in correct +order, reverse order or elements are being funneled into some mid-point. LEFT OR RIGHT From 70d5702e6a432f2b7ea2e9da168799f7e6f1085a Mon Sep 17 00:00:00 2001 From: "d.grigonis" Date: Mon, 13 Oct 2025 11:48:37 +0300 Subject: [PATCH 14/14] docs --- Objects/listsort.txt | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/Objects/listsort.txt b/Objects/listsort.txt index a57ab34554c57a..0f87da65907414 100644 --- a/Objects/listsort.txt +++ b/Objects/listsort.txt @@ -836,15 +836,21 @@ So cutting the number of compares is almost always measurably helpful in CPython, and the savings swamp the quadratic-time data movement costs for reasonable minrun values. -Additionally, "binary insertion sort" has implemented adaptivity procedure, -which reduces the number of comparisons for cases where data is already -sorted to high degree in either forward or reversed order. -While "binary insertion sort" ensures optimal number of comparisons -it looses best case of textbook insertion sort when data is highly sorted -in correct order. Adaptivity addition brings that back and more at small cost. -It adapts to any data where position of next element is close to the one of -last element. Thus, it adapts to cases where it is highly sorted in correct +Additionally, "binary insertion sort" has implemented adaptivity procedure. +While "binary insertion sort" ensures optimal number of comparisons, +it looses best case of "textbook insertion sort" when data is highly sorted +in correct order. Adaptivity brings that back (and more) at a small cost. +It adapts to any data where position of the next element is close to the one of +the last. Thus, it adapts to cases where it is highly sorted in correct order, reverse order or elements are being funneled into some mid-point. +For example, it adapts well to sequence [-4, 4, -3, 3, -2, 2, -1, 1], +for which it only does ~2n comparisons. 
It needs roughly two compares per +element because, with the insertion point sitting in the middle, each new +element has to be compared against elements on both sides. +Also, it is worth noting that a driftless random walk is roughly the threshold +beyond which adaptivity starts paying off. +Thus, in practice, adaptivity has little effect when sorting raw stock prices, +where the drift is very small in comparison to the fluctuations around it. LEFT OR RIGHT
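
Appendix (editor's illustration, not part of the patch): below is a minimal standalone
C sketch of the adaptivity idea described above. It is not the CPython code itself; it
sorts plain ints, and the function name, the warm-up threshold (ok >= 8) and the cut-off
(std <= ok / 4) are illustrative stand-ins for the calibrated values used in
listobject.c. It keeps mu (the previous insertion point) and std (an EWMA of how far
successive insertion points move), probes twice around mu with step std, and then falls
back to ordinary binary insertion.

/* Sketch of adaptive binary insertion sort for plain ints.
 * Invariants during the search for a[ok]'s slot:
 *   a[0:lo] <= pivot, a[hi:ok] > pivot. */
#include <stddef.h>
#include <stdio.h>

static void adaptive_binary_insertion_sort(int *a, size_t n)
{
    if (n < 2)
        return;
    size_t mu = 0;        /* last insertion point */
    size_t std = 0;       /* EWMA of |insertion-point movement| */
    for (size_t ok = 1; ok < n; ++ok) {
        int pivot = a[ok];
        size_t lo = 0, hi = ok;
        /* Adaptive probes: only worthwhile once std is small vs. ok. */
        if (ok >= 8 && std <= ok / 4) {
            if (pivot < a[mu]) {             /* pivot belongs left of mu */
                hi = mu;
                for (int probe = 0; probe < 2 && lo < hi; ++probe) {
                    size_t m = (hi > std + 1) ? hi - 1 - std : lo;
                    if (m < lo)
                        m = lo;
                    if (pivot < a[m]) hi = m; else lo = m + 1;
                }
            }
            else {                           /* pivot belongs right of mu */
                lo = mu + 1;
                for (int probe = 0; probe < 2 && lo < hi; ++probe) {
                    size_t m = lo + std;
                    if (m >= hi)
                        m = hi - 1;
                    if (pivot < a[m]) hi = m; else lo = m + 1;
                }
            }
        }
        /* Ordinary binary search on whatever range is left. */
        while (lo < hi) {
            size_t m = (lo + hi) >> 1;
            if (pivot < a[m]) hi = m; else lo = m + 1;
        }
        /* Slide a[lo:ok] one slot to the right and drop the pivot in. */
        for (size_t m = ok; m > lo; --m)
            a[m] = a[m - 1];
        a[lo] = pivot;
        /* Update the running estimates. */
        std += (lo > mu) ? lo - mu : mu - lo;
        std /= 2;                            /* EWMA with alpha = 0.5 */
        mu = lo;
    }
}

int main(void)
{
    int data[] = {1, 2, 3, 5, 4, 6, 8, 7, 9, 11, 10, 12};
    size_t n = sizeof data / sizeof data[0];
    adaptive_binary_insertion_sort(data, n);
    for (size_t i = 0; i < n; ++i)
        printf("%d ", data[i]);
    printf("\n");
    return 0;
}

On inputs where successive insertion points stay close together (nearly sorted, nearly
reverse-sorted, or funneled toward a mid-point), the two probes usually pin down the
insertion slot in a couple of compares instead of about log2(ok), which is the effect
the patch's steps 2-3 aim for; on random data std quickly exceeds the cut-off and the
routine degrades gracefully to plain binary insertion.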