Commit 88c276a

BUG: fix polluted window in skewness computation (#62863)
1 parent 65a2a74 commit 88c276a

3 files changed: +130 additions, -110 deletions


doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 0 deletions
@@ -1255,6 +1255,7 @@ Groupby/resample/rolling
 - Bug in :meth:`DataFrameGroupby.transform` and :meth:`SeriesGroupby.transform` with a reducer and ``observed=False`` that coerces dtype to float when there are unobserved categories. (:issue:`55326`)
 - Bug in :meth:`Rolling.apply` for ``method="table"`` where column order was not being respected due to the columns getting sorted by default. (:issue:`59666`)
 - Bug in :meth:`Rolling.apply` where the applied function could be called on fewer than ``min_period`` periods if ``method="table"``. (:issue:`58868`)
+- Bug in :meth:`Rolling.skew` incorrectly computing skewness for windows following outliers due to numerical instability. The calculation now properly handles catastrophic cancellation by recomputing affected windows (:issue:`47461`)
 - Bug in :meth:`Series.resample` could raise when the date range ended shortly before a non-existent time. (:issue:`58380`)
 - Bug in :meth:`Series.resample` raising error when resampling non-nanosecond resolutions out of bounds for nanosecond precision (:issue:`57427`)
 
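The scenario behind this entry can be reproduced directly. A minimal sketch using the data from the regression test added in this commit (GH 47461):

    import pandas as pd

    # Window 4 slides past two large outliers; the windows after the outliers
    # previously inherited rounding error left over from their contributions.
    s = pd.Series([1e6, -1e6, 1, 2, 3, 4, 5, 6])
    print(s.rolling(4).skew())
    # With the fix, the purely small windows such as [3, 4, 5, 6] come out as 0.0.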

pandas/_libs/window/aggregations.pyx

Lines changed: 102 additions & 100 deletions
@@ -1,6 +1,7 @@
 # cython: boundscheck=False, wraparound=False, cdivision=True
 
 from libc.math cimport (
+    fabs,
     round,
     signbit,
     sqrt,
@@ -60,6 +61,12 @@ cdef:
     float64_t MAXfloat64 = np.inf
 
     float64_t NaN = <float64_t>np.nan
+    float64_t EpsF64 = np.finfo(np.float64).eps
+
+    # Consider an operation ill-conditioned if
+    # it will only have up to 3 significant digits in base 10 remaining.
+    # https://en.wikipedia.org/wiki/Condition_number
+    float64_t InvCondTol = EpsF64 * 1e3
 
 cdef bint is_monotonic_increasing_start_end_bounds(
     ndarray[int64_t, ndim=1] start, ndarray[int64_t, ndim=1] end
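The new InvCondTol constant flags an update as ill-conditioned when the operands are large relative to the result, i.e. when all but roughly three significant decimal digits would cancel. A rough pure-Python sketch of the same check (the helper name and example values are illustrative, not part of pandas):

    import numpy as np

    eps = np.finfo(np.float64).eps
    inv_cond_tol = eps * 1e3  # tolerate losing all but ~3 significant decimal digits

    def cancellation_suspect(update, accumulator):
        """True when accumulator + update loses most of its significant digits."""
        new_value = accumulator + update
        return (abs(update) + abs(accumulator)) * inv_cond_tol > abs(new_value)

    print(cancellation_suspect(1.0, 2.0))            # False: well-conditioned
    print(cancellation_suspect(1e16, -1e16 + 10.0))  # True: catastrophic cancellation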
@@ -482,75 +489,74 @@ def roll_var(const float64_t[:] values, ndarray[int64_t] start,
 
 
 cdef float64_t calc_skew(int64_t minp, int64_t nobs,
-                         float64_t x, float64_t xx, float64_t xxx,
+                         float64_t mean, float64_t m2, float64_t m3,
                          int64_t num_consecutive_same_value
                          ) noexcept nogil:
     cdef:
         float64_t result, dnobs
-        float64_t A, B, C, R
+        float64_t moments_ratio, correction
 
     if nobs >= minp:
         dnobs = <float64_t>nobs
-        A = x / dnobs
-        B = xx / dnobs - A * A
-        C = xxx / dnobs - A * A * A - 3 * A * B
 
         if nobs < 3:
             result = NaN
         # GH 42064 46431
         # uniform case, force result to be 0
         elif num_consecutive_same_value >= nobs:
             result = 0.0
-        # #18044: with uniform distribution, floating issue will
-        # cause B != 0. and cause the result is a very
+        # #18044: with degenerate distribution, floating issue will
+        # cause m2 != 0. and cause the result is a very
         # large number.
         #
         # in core/nanops.py nanskew/nankurt call the function
         # _zero_out_fperr(m2) to fix floating error.
         # if the variance is less than 1e-14, it could be
        # treat as zero, here we follow the original
-        # skew/kurt behaviour to check B <= 1e-14
-        elif B <= 1e-14:
+        # skew/kurt behaviour to check m2 <= n * 1e-14
+        elif m2 <= dnobs * 1e-14:
             result = NaN
         else:
-            R = sqrt(B)
-            result = ((sqrt(dnobs * (dnobs - 1.)) * C) /
-                      ((dnobs - 2) * R * R * R))
+            moments_ratio = m3 / (m2 * sqrt(m2))
+            correction = dnobs * sqrt((dnobs - 1)) / (dnobs - 2)
+            result = moments_ratio * correction
     else:
         result = NaN
 
     return result
 
 
 cdef void add_skew(float64_t val, int64_t *nobs,
-                   float64_t *x, float64_t *xx,
-                   float64_t *xxx,
-                   float64_t *compensation_x,
-                   float64_t *compensation_xx,
-                   float64_t *compensation_xxx,
+                   float64_t *mean, float64_t *m2,
+                   float64_t *m3,
+                   bint *numerically_unstable,
                    int64_t *num_consecutive_same_value,
                    float64_t *prev_value,
                    ) noexcept nogil:
     """ add a value from the skew calc """
     cdef:
-        float64_t y, t
+        float64_t n, delta, delta_n, term1, m3_update, new_m3
+
+    # Formulas adapted from
+    # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Higher-order_statistics
 
     # Not NaN
     if val == val:
-        nobs[0] = nobs[0] + 1
-
-        y = val - compensation_x[0]
-        t = x[0] + y
-        compensation_x[0] = t - x[0] - y
-        x[0] = t
-        y = val * val - compensation_xx[0]
-        t = xx[0] + y
-        compensation_xx[0] = t - xx[0] - y
-        xx[0] = t
-        y = val * val * val - compensation_xxx[0]
-        t = xxx[0] + y
-        compensation_xxx[0] = t - xxx[0] - y
-        xxx[0] = t
+        nobs[0] += 1
+        n = <float64_t>(nobs[0])
+        delta = val - mean[0]
+        delta_n = delta / n
+        term1 = delta * delta_n * (n - 1.0)
+
+        m3_update = delta_n * (term1 * (n - 2.0) - 3.0 * m2[0])
+        new_m3 = m3[0] + m3_update
+        if (fabs(m3_update) + fabs(m3[0])) * InvCondTol > fabs(new_m3):
+            # possible catastrophic cancellation
+            numerically_unstable[0] = True
+
+        m3[0] = new_m3
+        m2[0] += term1
+        mean[0] += delta_n
 
     # GH#42064, record num of same values to remove floating point artifacts
     if val == prev_value[0]:
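calc_skew now receives the running mean plus the second and third centered power sums (m2, m3) and applies the bias-adjusted skewness formula to them directly. A small NumPy sketch of the same arithmetic, checked against Series.skew; the window is the first full window of the existing large-value-range test, and the helper name is mine:

    import numpy as np
    import pandas as pd

    def skew_from_moment_sums(x):
        # m2 and m3 are sums of centered powers, as in calc_skew (not divided by n)
        x = np.asarray(x, dtype=np.float64)
        n = x.size
        mean = x.mean()
        m2 = ((x - mean) ** 2).sum()
        m3 = ((x - mean) ** 3).sum()
        moments_ratio = m3 / (m2 * np.sqrt(m2))
        correction = n * np.sqrt(n - 1) / (n - 2)
        return moments_ratio * correction

    window = [3000000.0, 1.0, 1.0, 2.0]
    print(skew_from_moment_sums(window))  # ~2.0
    print(pd.Series(window).skew())       # pandas' adjusted skewness, should agree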
@@ -562,116 +568,112 @@ cdef void add_skew(float64_t val, int64_t *nobs,
 
 
 cdef void remove_skew(float64_t val, int64_t *nobs,
-                      float64_t *x, float64_t *xx,
-                      float64_t *xxx,
-                      float64_t *compensation_x,
-                      float64_t *compensation_xx,
-                      float64_t *compensation_xxx) noexcept nogil:
+                      float64_t *mean, float64_t *m2,
+                      float64_t *m3,
+                      bint *numerically_unstable) noexcept nogil:
     """ remove a value from the skew calc """
     cdef:
-        float64_t y, t
+        float64_t n, delta, delta_n, term1, m3_update, new_m3
+
+    # This is the online update for the central moments
+    # when we remove an observation.
+    #
+    # δ = x - m_{n+1}
+    # m_{n} = m_{n+1} - (δ / n)
+    # m²_n = Σ_{i=1}^{n+1}(x_i - m_{n})² - (x - m_{n})²  # uses new mean
+    #      = m²_{n+1} - (δ²/n)*(n+1)
+    # m³_n = Σ_{i=1}^{n+1}(x_i - m_{n})³ - (x - m_{n})³  # uses new mean
+    #      = m³_{n+1} - (δ³/n²)*(n+1)*(n+2) + 3 * m²_{n+1}*(δ/n)
 
     # Not NaN
    if val == val:
-        nobs[0] = nobs[0] - 1
+        nobs[0] -= 1
+        n = <float64_t>(nobs[0])
+        delta = val - mean[0]
+        delta_n = delta / n
+        term1 = delta_n * delta * (n + 1.0)
 
-        y = - val - compensation_x[0]
-        t = x[0] + y
-        compensation_x[0] = t - x[0] - y
-        x[0] = t
-        y = - val * val - compensation_xx[0]
-        t = xx[0] + y
-        compensation_xx[0] = t - xx[0] - y
-        xx[0] = t
-        y = - val * val * val - compensation_xxx[0]
-        t = xxx[0] + y
-        compensation_xxx[0] = t - xxx[0] - y
-        xxx[0] = t
+        m3_update = delta_n * (term1 * (n + 2.0) - 3.0 * m2[0])
+        new_m3 = m3[0] - m3_update
+
+        if (fabs(m3_update) + fabs(m3[0])) * InvCondTol > fabs(new_m3):
+            # possible catastrophic cancellation
+            numerically_unstable[0] = True
+
+        m3[0] = new_m3
+        m2[0] -= term1
+        mean[0] -= delta_n
 
 
-def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start,
+def roll_skew(const float64_t[:] values, ndarray[int64_t] start,
              ndarray[int64_t] end, int64_t minp) -> np.ndarray:
     cdef:
         Py_ssize_t i, j
-        float64_t val, min_val, mean_val, sum_val = 0
-        float64_t compensation_xxx_add, compensation_xxx_remove
-        float64_t compensation_xx_add, compensation_xx_remove
-        float64_t compensation_x_add, compensation_x_remove
-        float64_t x, xx, xxx
+        float64_t val
+        float64_t mean, m2, m3
         float64_t prev_value
-        int64_t nobs = 0, N = len(start), V = len(values), nobs_mean = 0
+        int64_t nobs = 0, N = len(start)
         int64_t s, e, num_consecutive_same_value
-        ndarray[float64_t] output, values_copy
+        ndarray[float64_t] output
         bint is_monotonic_increasing_bounds
+        bint requires_recompute, numerically_unstable = False
 
     minp = max(minp, 3)
     is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds(
         start, end
     )
     output = np.empty(N, dtype=np.float64)
-    min_val = np.nanmin(values)
-    values_copy = np.copy(values)
 
     with nogil:
-        for i in range(0, V):
-            val = values_copy[i]
-            if val == val:
-                nobs_mean += 1
-                sum_val += val
-        mean_val = sum_val / nobs_mean
-        # Other cases would lead to imprecision for smallest values
-        if min_val - mean_val > -1e5:
-            mean_val = round(mean_val)
-        for i in range(0, V):
-            values_copy[i] = values_copy[i] - mean_val
-
         for i in range(0, N):
 
             s = start[i]
             e = end[i]
 
             # Over the first window, observations can only be added
             # never removed
-            if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]:
-
-                prev_value = values[s]
-                num_consecutive_same_value = 0
-
-                compensation_xxx_add = compensation_xxx_remove = 0
-                compensation_xx_add = compensation_xx_remove = 0
-                compensation_x_add = compensation_x_remove = 0
-                x = xx = xxx = 0
-                nobs = 0
-                for j in range(s, e):
-                    val = values_copy[j]
-                    add_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_add,
-                             &compensation_xx_add, &compensation_xxx_add,
-                             &num_consecutive_same_value, &prev_value)
-
-            else:
+            requires_recompute = (
+                i == 0
+                or not is_monotonic_increasing_bounds
+                or s >= end[i - 1]
+            )
 
+            if not requires_recompute:
                 # After the first window, observations can both be added
                 # and removed
                 # calculate deletes
                 for j in range(start[i - 1], s):
-                    val = values_copy[j]
-                    remove_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_remove,
-                                &compensation_xx_remove, &compensation_xxx_remove)
+                    val = values[j]
+                    remove_skew(val, &nobs, &mean, &m2, &m3, &numerically_unstable)
 
                 # calculate adds
                 for j in range(end[i - 1], e):
-                    val = values_copy[j]
-                    add_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_add,
-                             &compensation_xx_add, &compensation_xxx_add,
+                    val = values[j]
+                    add_skew(val, &nobs, &mean, &m2, &m3, &numerically_unstable,
                              &num_consecutive_same_value, &prev_value)
 
-            output[i] = calc_skew(minp, nobs, x, xx, xxx, num_consecutive_same_value)
+            if requires_recompute or numerically_unstable:
+
+                prev_value = values[s]
+                num_consecutive_same_value = 0
+
+                mean = m2 = m3 = 0.0
+                nobs = 0
+
+                for j in range(s, e):
+                    val = values[j]
+                    add_skew(val, &nobs, &mean, &m2, &m3, &numerically_unstable,
+                             &num_consecutive_same_value, &prev_value)
+
+                numerically_unstable = False
+
+            output[i] = calc_skew(minp, nobs, mean, m2, m3, num_consecutive_same_value)
 
             if not is_monotonic_increasing_bounds:
                 nobs = 0
-                x = 0.0
-                xx = 0.0
-                xxx = 0.0
+                mean = 0.0
+                m2 = 0.0
+                m3 = 0.0
 
     return output
 
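add_skew and remove_skew keep mean, m2 and m3 up to date incrementally (the Wikipedia higher-order-statistics update and its inverse), and roll_skew only falls back to recomputing a window from scratch when the window starts a new block or when one of those updates was flagged as a possible catastrophic cancellation, so the common case stays O(1) per step. A pure-Python sketch of the two updates, cross-checked against direct recomputation (helper names are mine, not pandas API):

    import numpy as np

    def add_value(val, n, mean, m2, m3):
        # mirrors add_skew: one-pass central-moment update
        n += 1
        delta = val - mean
        delta_n = delta / n
        term1 = delta * delta_n * (n - 1.0)
        m3 += delta_n * (term1 * (n - 2.0) - 3.0 * m2)  # uses m2 before its update
        m2 += term1
        mean += delta_n
        return n, mean, m2, m3

    def remove_value(val, n, mean, m2, m3):
        # mirrors remove_skew: inverse update when a value leaves the window
        n -= 1
        delta = val - mean
        delta_n = delta / n
        term1 = delta_n * delta * (n + 1.0)
        m3 -= delta_n * (term1 * (n + 2.0) - 3.0 * m2)  # uses m2 before its update
        m2 -= term1
        mean -= delta_n
        return n, mean, m2, m3

    rng = np.random.default_rng(0)
    data = rng.normal(size=10)

    n, mean, m2, m3 = 0, 0.0, 0.0, 0.0
    for v in data:
        n, mean, m2, m3 = add_value(v, n, mean, m2, m3)
    n, mean, m2, m3 = remove_value(data[0], n, mean, m2, m3)

    # After removing the first value, the running stats should match a direct
    # recomputation over the remaining observations.
    rest = data[1:]
    assert np.isclose(mean, rest.mean())
    assert np.isclose(m2, ((rest - rest.mean()) ** 2).sum())
    assert np.isclose(m3, ((rest - rest.mean()) ** 3).sum())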

pandas/tests/window/test_rolling.py

Lines changed: 27 additions & 10 deletions
@@ -1175,7 +1175,9 @@ def test_rolling_decreasing_indices(method):
     increasing = getattr(df.rolling(window=5), method)()
     decreasing = getattr(df_reverse.rolling(window=5), method)()
 
-    assert np.abs(decreasing.values[::-1][:-4] - increasing.values[4:]).max() < 1e-12
+    tm.assert_almost_equal(
+        decreasing.values[::-1][:-4], increasing.values[4:], atol=1e-12
+    )
 
 
 @pytest.mark.parametrize(
@@ -1441,17 +1443,30 @@ def test_rolling_skew_kurt_numerical_stability(method):
 
 
 @pytest.mark.parametrize(
-    ("method", "values"),
+    ("method", "data", "values"),
     [
-        ("skew", [2.0, 0.854563, 0.0, 1.999984]),
-        ("kurt", [4.0, -1.289256, -1.2, 3.999946]),
+        (
+            "skew",
+            [3000000, 1, 1, 2, 3, 4, 999],
+            [np.nan] * 3 + [2.0, 0.854563, 0.0, 1.999984],
+        ),
+        (
+            "skew",
+            [1e6, -1e6, 1, 2, 3, 4, 5, 6],
+            [np.nan] * 3 + [-5.51135192e-06, -2.0, 0.0, 0.0, 0.0],
+        ),
+        (
+            "kurt",
+            [3000000, 1, 1, 2, 3, 4, 999],
+            [np.nan] * 3 + [4.0, -1.289256, -1.2, 3.999946],
+        ),
     ],
 )
-def test_rolling_skew_kurt_large_value_range(method, values):
-    # GH: 37557
-    s = Series([3000000, 1, 1, 2, 3, 4, 999])
+def test_rolling_skew_kurt_large_value_range(method, data, values):
+    # GH: 37557, 47461
+    s = Series(data)
     result = getattr(s.rolling(4), method)()
-    expected = Series([np.nan] * 3 + values)
+    expected = Series(values)
     tm.assert_series_equal(result, expected)
 
 
@@ -1837,9 +1852,11 @@ def test_rolling_skew_kurt_floating_artifacts():
     sr = Series([1 / 3, 4, 0, 0, 0, 0, 0])
     r = sr.rolling(4)
     result = r.skew()
-    assert (result[-2:] == 0).all()
+    expected = Series([np.nan, np.nan, np.nan, 1.9619045191072484, 2.0, 0.0, 0.0])
+    tm.assert_series_equal(result, expected)
     result = r.kurt()
-    assert (result[-2:] == -3).all()
+    expected = Series([np.nan, np.nan, np.nan, 3.8636048803878786, 4.0, -3.0, -3.0])
+    tm.assert_series_equal(result, expected)
 
 
 def test_numeric_only_frame(arithmetic_win_operators, numeric_only):
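The hard-coded expectations in these tests correspond to recomputing each complete window from scratch (up to floating-point differences). A quick sketch for regenerating or sanity-checking the skew values, not part of the test suite:

    import numpy as np
    import pandas as pd

    s = pd.Series([1e6, -1e6, 1, 2, 3, 4, 5, 6])
    rolling = s.rolling(4).skew()

    # Reference: apply Series.skew to each complete length-4 window independently.
    reference = pd.Series(
        [s.iloc[i - 3 : i + 1].skew() if i >= 3 else np.nan for i in range(len(s))]
    )
    print(pd.concat({"rolling": rolling, "per_window": reference}, axis=1))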
