From 7401a368d5a8c6c8191b1151063c7521d8f8c3e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Fri, 24 Oct 2025 19:03:25 -0300 Subject: [PATCH 01/21] test: rewrite test to allow better parametrization --- pandas/tests/window/test_rolling.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 18aafa0d7b71e..2515e4959e52e 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -1438,17 +1438,25 @@ def test_rolling_skew_kurt_numerical_stability(method): @pytest.mark.parametrize( - ("method", "values"), + "method, data, values", [ - ("skew", [2.0, 0.854563, 0.0, 1.999984]), - ("kurt", [4.0, -1.289256, -1.2, 3.999946]), + ( + "skew", + [3000000, 1, 1, 2, 3, 4, 999], + [np.nan] * 3 + [2.0, 0.854563, 0.0, 1.999984], + ), + ( + "kurt", + [3000000, 1, 1, 2, 3, 4, 999], + [np.nan] * 3 + [4.0, -1.289256, -1.2, 3.999946], + ), ], ) -def test_rolling_skew_kurt_large_value_range(method, values): +def test_rolling_skew_kurt_large_value_range(method, data, values): # GH: 37557 - s = Series([3000000, 1, 1, 2, 3, 4, 999]) + s = Series(data) result = getattr(s.rolling(4), method)() - expected = Series([np.nan] * 3 + values) + expected = Series(values) tm.assert_series_equal(result, expected) From 23345cff57c83dd4ae94e0db65a349277f593718 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Sat, 25 Oct 2025 02:01:55 -0300 Subject: [PATCH 02/21] fix(rolling.skew): handle outliers in window This change uses online update for the mean instead of computing the centralized values. Additionally, it checks for possible catastrophic cancellation by big changes in 3rd central moment. --- pandas/_libs/window/aggregations.pyx | 198 ++++++++++----------------- pandas/tests/window/test_rolling.py | 11 +- 2 files changed, 85 insertions(+), 124 deletions(-) diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 0c8ea28b60ce8..77055d5cfe984 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -1,6 +1,7 @@ # cython: boundscheck=False, wraparound=False, cdivision=True from libc.math cimport ( + fabs, round, signbit, sqrt, @@ -482,40 +483,32 @@ def roll_var(const float64_t[:] values, ndarray[int64_t] start, cdef float64_t calc_skew(int64_t minp, int64_t nobs, - float64_t x, float64_t xx, float64_t xxx, - int64_t num_consecutive_same_value + float64_t mean, float64_t m2, float64_t m3, ) noexcept nogil: cdef: float64_t result, dnobs - float64_t A, B, C, R + float64_t moments_ratio, correction if nobs >= minp: dnobs = nobs - A = x / dnobs - B = xx / dnobs - A * A - C = xxx / dnobs - A * A * A - 3 * A * B if nobs < 3: result = NaN - # GH 42064 46431 - # uniform case, force result to be 0 - elif num_consecutive_same_value >= nobs: - result = 0.0 - # #18044: with uniform distribution, floating issue will - # cause B != 0. and cause the result is a very + # #18044: with degenerate distribution, floating issue will + # cause m2 != 0. and cause the result is a very # large number. # # in core/nanops.py nanskew/nankurt call the function # _zero_out_fperr(m2) to fix floating error. # if the variance is less than 1e-14, it could be # treat as zero, here we follow the original - # skew/kurt behaviour to check B <= 1e-14 - elif B <= 1e-14: + # skew/kurt behaviour to check m2 < n * (eps * eps * mean * mean) + elif m2 < dnobs * (1e-28 * mean * mean if fabs(mean) > 1e-14 else 1e-14): result = NaN else: - R = sqrt(B) - result = ((sqrt(dnobs * (dnobs - 1.)) * C) / - ((dnobs - 2) * R * R * R)) + moments_ratio = m3 / (m2 * sqrt(m2)) + correction = dnobs * sqrt((dnobs - 1)) / (dnobs - 2) + result = moments_ratio * correction else: result = NaN @@ -523,155 +516,116 @@ cdef float64_t calc_skew(int64_t minp, int64_t nobs, cdef void add_skew(float64_t val, int64_t *nobs, - float64_t *x, float64_t *xx, - float64_t *xxx, - float64_t *compensation_x, - float64_t *compensation_xx, - float64_t *compensation_xxx, - int64_t *num_consecutive_same_value, - float64_t *prev_value, + float64_t *mean, float64_t *m2, + float64_t *m3, + bint *numerically_unstable ) noexcept nogil: """ add a value from the skew calc """ cdef: - float64_t y, t + float64_t n, delta, delta_n, term1, m3_update, new_m3 # Not NaN if val == val: - nobs[0] = nobs[0] + 1 + nobs[0] += 1 + n = (nobs[0]) + delta = val - mean[0] + delta_n = delta / n + term1 = delta * delta_n * (n - 1.0) - y = val - compensation_x[0] - t = x[0] + y - compensation_x[0] = t - x[0] - y - x[0] = t - y = val * val - compensation_xx[0] - t = xx[0] + y - compensation_xx[0] = t - xx[0] - y - xx[0] = t - y = val * val * val - compensation_xxx[0] - t = xxx[0] + y - compensation_xxx[0] = t - xxx[0] - y - xxx[0] = t + m3_update = delta_n * (term1 * (n - 2.0) - 3.0 * m2[0]) + new_m3 = m3[0] + m3_update + if fabs(m3_update) + fabs(m3[0]) > 1e10 * fabs(new_m3): + # possible catastrophic cancellation + numerically_unstable[0] = True - # GH#42064, record num of same values to remove floating point artifacts - if val == prev_value[0]: - num_consecutive_same_value[0] += 1 - else: - # reset to 1 (include current value itself) - num_consecutive_same_value[0] = 1 - prev_value[0] = val + m3[0] = new_m3 + m2[0] += term1 + mean[0] += delta_n cdef void remove_skew(float64_t val, int64_t *nobs, - float64_t *x, float64_t *xx, - float64_t *xxx, - float64_t *compensation_x, - float64_t *compensation_xx, - float64_t *compensation_xxx) noexcept nogil: + float64_t *mean, float64_t *m2, + float64_t *m3, + bint *numerically_unstable) noexcept nogil: """ remove a value from the skew calc """ cdef: - float64_t y, t + float64_t n, delta, delta_n, term1, m3_update, new_m3 # Not NaN if val == val: - nobs[0] = nobs[0] - 1 + nobs[0] -= 1 + n = (nobs[0]) + delta = val - mean[0] + delta_n = delta / n + term1 = delta_n * delta * (n + 1.0) - y = - val - compensation_x[0] - t = x[0] + y - compensation_x[0] = t - x[0] - y - x[0] = t - y = - val * val - compensation_xx[0] - t = xx[0] + y - compensation_xx[0] = t - xx[0] - y - xx[0] = t - y = - val * val * val - compensation_xxx[0] - t = xxx[0] + y - compensation_xxx[0] = t - xxx[0] - y - xxx[0] = t + m3_update = delta_n * (term1 * (n + 2.0) - 3.0 * m2[0]) + new_m3 = m3[0] - m3_update + + if fabs(m3_update) + fabs(m3[0]) > 1e10 * fabs(new_m3): + # possible catastrophic cancellation + numerically_unstable[0] = True + + m3[0] = new_m3 + m2[0] -= term1 + mean[0] -= delta_n def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp) -> np.ndarray: cdef: Py_ssize_t i, j - float64_t val, min_val, mean_val, sum_val = 0 - float64_t compensation_xxx_add, compensation_xxx_remove - float64_t compensation_xx_add, compensation_xx_remove - float64_t compensation_x_add, compensation_x_remove - float64_t x, xx, xxx - float64_t prev_value - int64_t nobs = 0, N = len(start), V = len(values), nobs_mean = 0 - int64_t s, e, num_consecutive_same_value - ndarray[float64_t] output, values_copy - bint is_monotonic_increasing_bounds + float64_t val + float64_t mean, m2, m3 + int64_t nobs = 0, N = len(start) + int64_t s, e + ndarray[float64_t] output + bint requires_recompute, numerically_unstable = False minp = max(minp, 3) is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds( start, end ) output = np.empty(N, dtype=np.float64) - min_val = np.nanmin(values) - values_copy = np.copy(values) with nogil: - for i in range(0, V): - val = values_copy[i] - if val == val: - nobs_mean += 1 - sum_val += val - mean_val = sum_val / nobs_mean - # Other cases would lead to imprecision for smallest values - if min_val - mean_val > -1e5: - mean_val = round(mean_val) - for i in range(0, V): - values_copy[i] = values_copy[i] - mean_val - for i in range(0, N): - s = start[i] e = end[i] - # Over the first window, observations can only be added - # never removed - if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]: - - prev_value = values[s] - num_consecutive_same_value = 0 - - compensation_xxx_add = compensation_xxx_remove = 0 - compensation_xx_add = compensation_xx_remove = 0 - compensation_x_add = compensation_x_remove = 0 - x = xx = xxx = 0 - nobs = 0 - for j in range(s, e): - val = values_copy[j] - add_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_add, - &compensation_xx_add, &compensation_xxx_add, - &num_consecutive_same_value, &prev_value) - - else: + requires_recompute = ( + i == 0 + or not is_monotonic_increasing_bounds + or s >= end[i - 1] + or numerically_unstable + ) - # After the first window, observations can both be added - # and removed - # calculate deletes + if not requires_recompute: for j in range(start[i - 1], s): - val = values_copy[j] - remove_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_remove, - &compensation_xx_remove, &compensation_xxx_remove) + val = values[j] + remove_skew(val, &nobs, &mean, &m2, &m3, &numerically_unstable) # calculate adds for j in range(end[i - 1], e): - val = values_copy[j] - add_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_add, - &compensation_xx_add, &compensation_xxx_add, - &num_consecutive_same_value, &prev_value) + val = values[j] + add_skew(val, &nobs, &mean, &m2, &m3, &numerically_unstable) - output[i] = calc_skew(minp, nobs, x, xx, xxx, num_consecutive_same_value) + if requires_recompute or numerically_unstable: + numerically_unstable = False + mean = m2 = m3 = 0.0 + nobs = 0 + + for j in range(s, e): + val = values[j] + add_skew(val, &nobs, &mean, &m2, &m3, &numerically_unstable) + + output[i] = calc_skew(minp, nobs, mean, m2, m3) if not is_monotonic_increasing_bounds: nobs = 0 - x = 0.0 - xx = 0.0 - xxx = 0.0 + mean = 0.0 + m2 = 0.0 + m3 = 0.0 return output diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 2515e4959e52e..6154600ebac81 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -1172,7 +1172,9 @@ def test_rolling_decreasing_indices(method): increasing = getattr(df.rolling(window=5), method)() decreasing = getattr(df_reverse.rolling(window=5), method)() - assert np.abs(decreasing.values[::-1][:-4] - increasing.values[4:]).max() < 1e-12 + tm.assert_almost_equal( + decreasing.values[::-1][:-4], increasing.values[4:], atol=1e-12 + ) @pytest.mark.parametrize( @@ -1438,13 +1440,18 @@ def test_rolling_skew_kurt_numerical_stability(method): @pytest.mark.parametrize( - "method, data, values", + ("method", "data", "values"), [ ( "skew", [3000000, 1, 1, 2, 3, 4, 999], [np.nan] * 3 + [2.0, 0.854563, 0.0, 1.999984], ), + ( + "skew", + [1e6, -1e6, 1, 2, 3, 4, 5, 6], + [np.nan] * 3 + [-5.51135192e-06, -2.0, 0.0, 0.0, 0.0], + ), ( "kurt", [3000000, 1, 1, 2, 3, 4, 999], From 488e99e5b8b9109b22f02de3b5721da4db8fd2e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Sat, 25 Oct 2025 23:21:34 -0300 Subject: [PATCH 03/21] test: change tested behavior --- pandas/tests/window/test_rolling.py | 7 +++++-- pandas/tests/window/test_rolling_skew_kurt.py | 9 ++++----- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 6154600ebac81..8c6f91ce10767 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -1843,15 +1843,18 @@ def test_rolling_mean_sum_floating_artifacts(): assert (result[-3:] == 0).all() +@pytest.mark.xfail(reason="Series.rolling.kurt computes -3 to degenerate distribution") def test_rolling_skew_kurt_floating_artifacts(): # GH 42064 46431 sr = Series([1 / 3, 4, 0, 0, 0, 0, 0]) r = sr.rolling(4) result = r.skew() - assert (result[-2:] == 0).all() + expected = Series([np.nan, np.nan, np.nan, 1.9619045191072484, 2.0, np.nan, np.nan]) + tm.assert_series_equal(result, expected) result = r.kurt() - assert (result[-2:] == -3).all() + expected = Series([np.nan, np.nan, np.nan, 3.8636048803878786, 4.0, np.nan, np.nan]) + tm.assert_series_equal(result, expected) def test_numeric_only_frame(arithmetic_win_operators, numeric_only): diff --git a/pandas/tests/window/test_rolling_skew_kurt.py b/pandas/tests/window/test_rolling_skew_kurt.py index 79c14f243e7cc..00ced86f293cb 100644 --- a/pandas/tests/window/test_rolling_skew_kurt.py +++ b/pandas/tests/window/test_rolling_skew_kurt.py @@ -170,11 +170,11 @@ def test_center_reindex_frame(frame, roll_func): def test_rolling_skew_edge_cases(step): - expected = Series([np.nan] * 4 + [0.0])[::step] + expected = Series([np.nan] * 5)[::step] # yields all NaN (0 variance) d = Series([1] * 5) x = d.rolling(window=5, step=step).skew() - # index 4 should be 0 as it contains 5 same obs + # index 4 should be NaN as it contains 5 same obs (variance 0) tm.assert_series_equal(expected, x) expected = Series([np.nan] * 5)[::step] @@ -213,10 +213,9 @@ def test_rolling_kurt_edge_cases(step): def test_rolling_skew_eq_value_fperr(step): # #18804 all rolling skew for all equal values should return Nan - # #46717 update: all equal values should return 0 instead of NaN a = Series([1.1] * 15).rolling(window=10, step=step).skew() - assert (a[a.index >= 9] == 0).all() - assert a[a.index < 9].isna().all() + expected = Series([np.nan] * 15)[::step] + tm.assert_series_equal(expected, a) def test_rolling_kurt_eq_value_fperr(step): From ff9108d0df0905da6bb51c633a986630cf72886c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Sun, 26 Oct 2025 14:00:12 -0300 Subject: [PATCH 04/21] docs(whatsnew): add bufix entry --- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index cc93494645af6..5faca3276cfb4 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -1165,6 +1165,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrameGroupby.transform` and :meth:`SeriesGroupby.transform` with a reducer and ``observed=False`` that coerces dtype to float when there are unobserved categories. (:issue:`55326`) - Bug in :meth:`Rolling.apply` for ``method="table"`` where column order was not being respected due to the columns getting sorted by default. (:issue:`59666`) - Bug in :meth:`Rolling.apply` where the applied function could be called on fewer than ``min_period`` periods if ``method="table"``. (:issue:`58868`) +- Bug in :meth:`Rolling.skew` incorrectly computing skewness for windows following outliers due to numerical instability. The calculation now properly handles catastrophic cancellation by recomputing affected windows (:issue:`47461`) - Bug in :meth:`Series.resample` could raise when the date range ended shortly before a non-existent time. (:issue:`58380`) Reshaping From a2d093737d016935d3364898776616ac69b8631a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Sun, 26 Oct 2025 14:13:22 -0300 Subject: [PATCH 05/21] docs(whatsnew): add change for degenerate distribution --- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 5faca3276cfb4..0123a090a1ecc 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -1166,6 +1166,7 @@ Groupby/resample/rolling - Bug in :meth:`Rolling.apply` for ``method="table"`` where column order was not being respected due to the columns getting sorted by default. (:issue:`59666`) - Bug in :meth:`Rolling.apply` where the applied function could be called on fewer than ``min_period`` periods if ``method="table"``. (:issue:`58868`) - Bug in :meth:`Rolling.skew` incorrectly computing skewness for windows following outliers due to numerical instability. The calculation now properly handles catastrophic cancellation by recomputing affected windows (:issue:`47461`) +- Bug in :meth:`Rolling.skew` incorrectly returning ``0.0`` for constant-value windows. Now returns ``NaN`` to match the statistical definition of skewness for degenerate distributions - Bug in :meth:`Series.resample` could raise when the date range ended shortly before a non-existent time. (:issue:`58380`) Reshaping From 8020107e7b5149df4887b79b0f72f9ca13a74a91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Sun, 26 Oct 2025 15:00:37 -0300 Subject: [PATCH 06/21] docs: link issue --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 0123a090a1ecc..4ec3caaa7eb4d 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -1166,7 +1166,7 @@ Groupby/resample/rolling - Bug in :meth:`Rolling.apply` for ``method="table"`` where column order was not being respected due to the columns getting sorted by default. (:issue:`59666`) - Bug in :meth:`Rolling.apply` where the applied function could be called on fewer than ``min_period`` periods if ``method="table"``. (:issue:`58868`) - Bug in :meth:`Rolling.skew` incorrectly computing skewness for windows following outliers due to numerical instability. The calculation now properly handles catastrophic cancellation by recomputing affected windows (:issue:`47461`) -- Bug in :meth:`Rolling.skew` incorrectly returning ``0.0`` for constant-value windows. Now returns ``NaN`` to match the statistical definition of skewness for degenerate distributions +- Bug in :meth:`Rolling.skew` incorrectly returning ``0.0`` for constant-value windows. Now returns ``NaN`` to match the statistical definition of skewness for degenerate distributions (:issue:`62864`) - Bug in :meth:`Series.resample` could raise when the date range ended shortly before a non-existent time. (:issue:`58380`) Reshaping From edc23932276f75bcebf2283950039436e4052274 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Sun, 26 Oct 2025 20:42:28 -0300 Subject: [PATCH 07/21] perf: recompute the window less --- pandas/_libs/window/aggregations.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 77055d5cfe984..11f81a9a4504e 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -611,7 +611,6 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start, add_skew(val, &nobs, &mean, &m2, &m3, &numerically_unstable) if requires_recompute or numerically_unstable: - numerically_unstable = False mean = m2 = m3 = 0.0 nobs = 0 @@ -619,6 +618,8 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start, val = values[j] add_skew(val, &nobs, &mean, &m2, &m3, &numerically_unstable) + numerically_unstable = False + output[i] = calc_skew(minp, nobs, mean, m2, m3) if not is_monotonic_increasing_bounds: From ae1f3f917653f1a8be8934a41e212408ecbbe5e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Mon, 27 Oct 2025 16:07:52 -0300 Subject: [PATCH 08/21] perf: use fma for `m3_update` --- pandas/_libs/window/aggregations.pyx | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 11f81a9a4504e..25a82b245f74f 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -2,6 +2,7 @@ from libc.math cimport ( fabs, + fma, round, signbit, sqrt, @@ -532,7 +533,7 @@ cdef void add_skew(float64_t val, int64_t *nobs, delta_n = delta / n term1 = delta * delta_n * (n - 1.0) - m3_update = delta_n * (term1 * (n - 2.0) - 3.0 * m2[0]) + m3_update = delta_n * fma(term1, n - 2.0, -3.0 * m2[0]) new_m3 = m3[0] + m3_update if fabs(m3_update) + fabs(m3[0]) > 1e10 * fabs(new_m3): # possible catastrophic cancellation @@ -559,7 +560,7 @@ cdef void remove_skew(float64_t val, int64_t *nobs, delta_n = delta / n term1 = delta_n * delta * (n + 1.0) - m3_update = delta_n * (term1 * (n + 2.0) - 3.0 * m2[0]) + m3_update = delta_n * fma(term1, n + 2.0, -3.0 * m2[0]) new_m3 = m3[0] - m3_update if fabs(m3_update) + fabs(m3[0]) > 1e10 * fabs(new_m3): From 5cc8b6014bc8183ba9da617c6655da3ca8477108 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Mon, 27 Oct 2025 16:21:44 -0300 Subject: [PATCH 09/21] chore: add formulas for update --- pandas/_libs/window/aggregations.pyx | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 25a82b245f74f..a15263afe6e7c 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -525,6 +525,9 @@ cdef void add_skew(float64_t val, int64_t *nobs, cdef: float64_t n, delta, delta_n, term1, m3_update, new_m3 + # This formulas are adapted from + # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Higher-order_statistics + # Not NaN if val == val: nobs[0] += 1 @@ -552,6 +555,16 @@ cdef void remove_skew(float64_t val, int64_t *nobs, cdef: float64_t n, delta, delta_n, term1, m3_update, new_m3 + # This is the online update for the central moments + # when we remove an observation. + # + # δ = x - m_{n+1} + # m_{n} = m_{n+1} - (δ / n) + # m²_n = Σ_{i=1}^{n+1}(x_i - m_{n})² - (x - m_{n})² # uses new mean + # = m²_{n+1} - (δ²/n)*(n+1) + # m³_n = Σ_{i=1}^{n+1}(x_i - m_{n})³ - (x - m_{n})³ # uses new mean + # = m³_{n+1} - (δ³/n²)*(n+1)*(n+2) + 3 * m²_{n+1}*(δ/n) + # Not NaN if val == val: nobs[0] -= 1 From 7f1d2a37cf777169ae638cf5860dedf31ddb02c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Mon, 27 Oct 2025 21:18:14 -0300 Subject: [PATCH 10/21] test(rolling): undo test modifications for degenerate windows --- pandas/tests/window/test_rolling.py | 5 ++--- pandas/tests/window/test_rolling_skew_kurt.py | 9 +++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 8c6f91ce10767..490af3c5ee45e 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -1843,17 +1843,16 @@ def test_rolling_mean_sum_floating_artifacts(): assert (result[-3:] == 0).all() -@pytest.mark.xfail(reason="Series.rolling.kurt computes -3 to degenerate distribution") def test_rolling_skew_kurt_floating_artifacts(): # GH 42064 46431 sr = Series([1 / 3, 4, 0, 0, 0, 0, 0]) r = sr.rolling(4) result = r.skew() - expected = Series([np.nan, np.nan, np.nan, 1.9619045191072484, 2.0, np.nan, np.nan]) + expected = Series([np.nan, np.nan, np.nan, 1.9619045191072484, 2.0, 0.0, 0.0]) tm.assert_series_equal(result, expected) result = r.kurt() - expected = Series([np.nan, np.nan, np.nan, 3.8636048803878786, 4.0, np.nan, np.nan]) + expected = Series([np.nan, np.nan, np.nan, 3.8636048803878786, 4.0, -3.0, -3.0]) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/window/test_rolling_skew_kurt.py b/pandas/tests/window/test_rolling_skew_kurt.py index 00ced86f293cb..79c14f243e7cc 100644 --- a/pandas/tests/window/test_rolling_skew_kurt.py +++ b/pandas/tests/window/test_rolling_skew_kurt.py @@ -170,11 +170,11 @@ def test_center_reindex_frame(frame, roll_func): def test_rolling_skew_edge_cases(step): - expected = Series([np.nan] * 5)[::step] + expected = Series([np.nan] * 4 + [0.0])[::step] # yields all NaN (0 variance) d = Series([1] * 5) x = d.rolling(window=5, step=step).skew() - # index 4 should be NaN as it contains 5 same obs (variance 0) + # index 4 should be 0 as it contains 5 same obs tm.assert_series_equal(expected, x) expected = Series([np.nan] * 5)[::step] @@ -213,9 +213,10 @@ def test_rolling_kurt_edge_cases(step): def test_rolling_skew_eq_value_fperr(step): # #18804 all rolling skew for all equal values should return Nan + # #46717 update: all equal values should return 0 instead of NaN a = Series([1.1] * 15).rolling(window=10, step=step).skew() - expected = Series([np.nan] * 15)[::step] - tm.assert_series_equal(expected, a) + assert (a[a.index >= 9] == 0).all() + assert a[a.index < 9].isna().all() def test_rolling_kurt_eq_value_fperr(step): From 0aa2949735c84e914ee05fc3268430b5b2f6a88c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Mon, 27 Oct 2025 21:19:08 -0300 Subject: [PATCH 11/21] docs: remove note about nan change --- doc/source/whatsnew/v3.0.0.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 4ec3caaa7eb4d..5faca3276cfb4 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -1166,7 +1166,6 @@ Groupby/resample/rolling - Bug in :meth:`Rolling.apply` for ``method="table"`` where column order was not being respected due to the columns getting sorted by default. (:issue:`59666`) - Bug in :meth:`Rolling.apply` where the applied function could be called on fewer than ``min_period`` periods if ``method="table"``. (:issue:`58868`) - Bug in :meth:`Rolling.skew` incorrectly computing skewness for windows following outliers due to numerical instability. The calculation now properly handles catastrophic cancellation by recomputing affected windows (:issue:`47461`) -- Bug in :meth:`Rolling.skew` incorrectly returning ``0.0`` for constant-value windows. Now returns ``NaN`` to match the statistical definition of skewness for degenerate distributions (:issue:`62864`) - Bug in :meth:`Series.resample` could raise when the date range ended shortly before a non-existent time. (:issue:`58380`) Reshaping From 6f211676ee7055b0d2e3b130c3c2ed8efc3c9a84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Mon, 27 Oct 2025 21:23:04 -0300 Subject: [PATCH 12/21] fix: undo scale check for m2 --- pandas/_libs/window/aggregations.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index a15263afe6e7c..ddae3c362f867 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -503,8 +503,8 @@ cdef float64_t calc_skew(int64_t minp, int64_t nobs, # _zero_out_fperr(m2) to fix floating error. # if the variance is less than 1e-14, it could be # treat as zero, here we follow the original - # skew/kurt behaviour to check m2 < n * (eps * eps * mean * mean) - elif m2 < dnobs * (1e-28 * mean * mean if fabs(mean) > 1e-14 else 1e-14): + # skew/kurt behaviour to check m2 <= n * 1e-14 + elif m2 <= dnobs * 1e-14: result = NaN else: moments_ratio = m3 / (m2 * sqrt(m2)) From cca03e94cd92283ad87cf06d9efb98bbc7d926ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Mon, 27 Oct 2025 21:55:42 -0300 Subject: [PATCH 13/21] fix: undo NaN changes --- pandas/_libs/window/aggregations.pyx | 31 +++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index ddae3c362f867..de7f55542e345 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -485,6 +485,7 @@ def roll_var(const float64_t[:] values, ndarray[int64_t] start, cdef float64_t calc_skew(int64_t minp, int64_t nobs, float64_t mean, float64_t m2, float64_t m3, + int64_t num_consecutive_same_value ) noexcept nogil: cdef: float64_t result, dnobs @@ -495,6 +496,10 @@ cdef float64_t calc_skew(int64_t minp, int64_t nobs, if nobs < 3: result = NaN + # GH 42064 46431 + # uniform case, force result to be 0 + elif num_consecutive_same_value >= nobs: + result = 0.0 # #18044: with degenerate distribution, floating issue will # cause m2 != 0. and cause the result is a very # large number. @@ -519,7 +524,9 @@ cdef float64_t calc_skew(int64_t minp, int64_t nobs, cdef void add_skew(float64_t val, int64_t *nobs, float64_t *mean, float64_t *m2, float64_t *m3, - bint *numerically_unstable + bint *numerically_unstable, + int64_t *num_consecutive_same_value, + float64_t *prev_value, ) noexcept nogil: """ add a value from the skew calc """ cdef: @@ -546,6 +553,14 @@ cdef void add_skew(float64_t val, int64_t *nobs, m2[0] += term1 mean[0] += delta_n + # GH#42064, record num of same values to remove floating point artifacts + if val == prev_value[0]: + num_consecutive_same_value[0] += 1 + else: + # reset to 1 (include current value itself) + num_consecutive_same_value[0] = 1 + prev_value[0] = val + cdef void remove_skew(float64_t val, int64_t *nobs, float64_t *mean, float64_t *m2, @@ -591,9 +606,11 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start, Py_ssize_t i, j float64_t val float64_t mean, m2, m3 + float64_t prev_value int64_t nobs = 0, N = len(start) - int64_t s, e + int64_t s, e, num_consecutive_same_value ndarray[float64_t] output + bint is_monotonic_increasing_bounds bint requires_recompute, numerically_unstable = False minp = max(minp, 3) @@ -604,6 +621,7 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start, with nogil: for i in range(0, N): + s = start[i] e = end[i] @@ -615,6 +633,7 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start, ) if not requires_recompute: + # calculate deletes for j in range(start[i - 1], s): val = values[j] remove_skew(val, &nobs, &mean, &m2, &m3, &numerically_unstable) @@ -622,7 +641,8 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start, # calculate adds for j in range(end[i - 1], e): val = values[j] - add_skew(val, &nobs, &mean, &m2, &m3, &numerically_unstable) + add_skew(val, &nobs, &mean, &m2, &m3, &numerically_unstable, + &num_consecutive_same_value, &prev_value) if requires_recompute or numerically_unstable: mean = m2 = m3 = 0.0 @@ -630,11 +650,12 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start, for j in range(s, e): val = values[j] - add_skew(val, &nobs, &mean, &m2, &m3, &numerically_unstable) + add_skew(val, &nobs, &mean, &m2, &m3, &numerically_unstable, + &num_consecutive_same_value, &prev_value) numerically_unstable = False - output[i] = calc_skew(minp, nobs, mean, m2, m3) + output[i] = calc_skew(minp, nobs, mean, m2, m3, num_consecutive_same_value) if not is_monotonic_increasing_bounds: nobs = 0 From 6177d6aaec845c5261be99062d16b3a2f2e29ac4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Mon, 27 Oct 2025 22:25:24 -0300 Subject: [PATCH 14/21] chore: simplifies diff --- pandas/_libs/window/aggregations.pyx | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index de7f55542e345..9a82c4dff5e16 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -625,14 +625,17 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start, s = start[i] e = end[i] + # Over the first window, observations can only be added + # never removed requires_recompute = ( - i == 0 - or not is_monotonic_increasing_bounds - or s >= end[i - 1] - or numerically_unstable + i == 0 + or not is_monotonic_increasing_bounds + or s >= end[i - 1] ) if not requires_recompute: + # After the first window, observations can both be added + # and removed # calculate deletes for j in range(start[i - 1], s): val = values[j] From 9d9451d2656a0a8a86e784fbd74dd9ec694a8edc Mon Sep 17 00:00:00 2001 From: Alvaro-Kothe Date: Tue, 28 Oct 2025 21:14:20 -0300 Subject: [PATCH 15/21] fix: re-add consecutive count reset to recompute --- pandas/_libs/window/aggregations.pyx | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 9a82c4dff5e16..93d67bc265c59 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -648,6 +648,10 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start, &num_consecutive_same_value, &prev_value) if requires_recompute or numerically_unstable: + + prev_value = values[s] + num_consecutive_same_value = 0 + mean = m2 = m3 = 0.0 nobs = 0 From 74aba02ad18eb0fe04797dc81ffe1d2147269104 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Wed, 29 Oct 2025 10:11:59 -0300 Subject: [PATCH 16/21] chore: remove extra whitespace --- pandas/_libs/window/aggregations.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 93d67bc265c59..3eb3d75246f95 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -628,9 +628,9 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start, # Over the first window, observations can only be added # never removed requires_recompute = ( - i == 0 - or not is_monotonic_increasing_bounds - or s >= end[i - 1] + i == 0 + or not is_monotonic_increasing_bounds + or s >= end[i - 1] ) if not requires_recompute: From a1fb80080368381ddfafea8001735af83bb5f2c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Fri, 31 Oct 2025 19:16:13 -0300 Subject: [PATCH 17/21] chore: justify ill-condition tolerance --- pandas/_libs/window/aggregations.pyx | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 3eb3d75246f95..3fd88a0e82f1b 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -62,6 +62,12 @@ cdef: float64_t MAXfloat64 = np.inf float64_t NaN = np.nan + float64_t EpsF64 = np.finfo(np.float64).eps + + # Consider an operation ill-conditioned if + # it will only have up to 3 significant digits in base 10 remaining. + # https://en.wikipedia.org/wiki/Condition_number + float64_t InvConditionNumber = EpsF64 * 1e3 cdef bint is_monotonic_increasing_start_end_bounds( ndarray[int64_t, ndim=1] start, ndarray[int64_t, ndim=1] end @@ -532,7 +538,7 @@ cdef void add_skew(float64_t val, int64_t *nobs, cdef: float64_t n, delta, delta_n, term1, m3_update, new_m3 - # This formulas are adapted from + # Formulas adapted from # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Higher-order_statistics # Not NaN @@ -545,7 +551,7 @@ cdef void add_skew(float64_t val, int64_t *nobs, m3_update = delta_n * fma(term1, n - 2.0, -3.0 * m2[0]) new_m3 = m3[0] + m3_update - if fabs(m3_update) + fabs(m3[0]) > 1e10 * fabs(new_m3): + if (fabs(m3_update) + fabs(m3[0])) * InvConditionNumber > fabs(new_m3): # possible catastrophic cancellation numerically_unstable[0] = True @@ -591,7 +597,7 @@ cdef void remove_skew(float64_t val, int64_t *nobs, m3_update = delta_n * fma(term1, n + 2.0, -3.0 * m2[0]) new_m3 = m3[0] - m3_update - if fabs(m3_update) + fabs(m3[0]) > 1e10 * fabs(new_m3): + if (fabs(m3_update) + fabs(m3[0])) * InvConditionNumber > fabs(new_m3): # possible catastrophic cancellation numerically_unstable[0] = True From b6b68b90efc481357a089c4f23b0232161c8da44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Mon, 3 Nov 2025 12:02:50 -0300 Subject: [PATCH 18/21] chore: rename IncConditionNumber -> InvCondTol --- pandas/_libs/window/aggregations.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 3fd88a0e82f1b..bb09a28adcec1 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -67,7 +67,7 @@ cdef: # Consider an operation ill-conditioned if # it will only have up to 3 significant digits in base 10 remaining. # https://en.wikipedia.org/wiki/Condition_number - float64_t InvConditionNumber = EpsF64 * 1e3 + float64_t InvCondTol = EpsF64 * 1e3 cdef bint is_monotonic_increasing_start_end_bounds( ndarray[int64_t, ndim=1] start, ndarray[int64_t, ndim=1] end @@ -551,7 +551,7 @@ cdef void add_skew(float64_t val, int64_t *nobs, m3_update = delta_n * fma(term1, n - 2.0, -3.0 * m2[0]) new_m3 = m3[0] + m3_update - if (fabs(m3_update) + fabs(m3[0])) * InvConditionNumber > fabs(new_m3): + if (fabs(m3_update) + fabs(m3[0])) * InvCondTol > fabs(new_m3): # possible catastrophic cancellation numerically_unstable[0] = True @@ -597,7 +597,7 @@ cdef void remove_skew(float64_t val, int64_t *nobs, m3_update = delta_n * fma(term1, n + 2.0, -3.0 * m2[0]) new_m3 = m3[0] - m3_update - if (fabs(m3_update) + fabs(m3[0])) * InvConditionNumber > fabs(new_m3): + if (fabs(m3_update) + fabs(m3[0])) * InvCondTol > fabs(new_m3): # possible catastrophic cancellation numerically_unstable[0] = True From 4a095acaf149001de586f90f9717bf75ca179b32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Mon, 3 Nov 2025 12:04:38 -0300 Subject: [PATCH 19/21] chore: add github issue to test --- pandas/tests/window/test_rolling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 5320ca9c67655..5b00aeed79db6 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -1463,7 +1463,7 @@ def test_rolling_skew_kurt_numerical_stability(method): ], ) def test_rolling_skew_kurt_large_value_range(method, data, values): - # GH: 37557 + # GH: 37557, 47461 s = Series(data) result = getattr(s.rolling(4), method)() expected = Series(values) From 7f2a74874245fe0c5290e5b20c371af3018d5e73 Mon Sep 17 00:00:00 2001 From: Alvaro-Kothe Date: Mon, 3 Nov 2025 18:40:07 -0300 Subject: [PATCH 20/21] perf: stop using `fma` --- pandas/_libs/window/aggregations.pyx | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index bb09a28adcec1..93c145f470f2a 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -2,7 +2,6 @@ from libc.math cimport ( fabs, - fma, round, signbit, sqrt, @@ -549,7 +548,7 @@ cdef void add_skew(float64_t val, int64_t *nobs, delta_n = delta / n term1 = delta * delta_n * (n - 1.0) - m3_update = delta_n * fma(term1, n - 2.0, -3.0 * m2[0]) + m3_update = delta_n * (term1 * (n - 2.0) - 3.0 * m2[0]) new_m3 = m3[0] + m3_update if (fabs(m3_update) + fabs(m3[0])) * InvCondTol > fabs(new_m3): # possible catastrophic cancellation @@ -594,7 +593,7 @@ cdef void remove_skew(float64_t val, int64_t *nobs, delta_n = delta / n term1 = delta_n * delta * (n + 1.0) - m3_update = delta_n * fma(term1, n + 2.0, -3.0 * m2[0]) + m3_update = delta_n * (term1 * (n + 2.0) - 3.0 * m2[0]) new_m3 = m3[0] - m3_update if (fabs(m3_update) + fabs(m3[0])) * InvCondTol > fabs(new_m3): From f5f0fed624e80bca961ba1f9207624fa3d1139c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Tue, 4 Nov 2025 15:56:03 -0300 Subject: [PATCH 21/21] fix: pass memview to `roll_skew` --- pandas/_libs/window/aggregations.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 93c145f470f2a..dccd93e8dafd9 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -605,7 +605,7 @@ cdef void remove_skew(float64_t val, int64_t *nobs, mean[0] -= delta_n -def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start, +def roll_skew(const float64_t[:] values, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp) -> np.ndarray: cdef: Py_ssize_t i, j