Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,7 @@ Other enhancements
- :func:`read_spss` now supports kwargs to be passed to pyreadstat (:issue:`56356`)
- :func:`read_stata` now returns ``datetime64`` resolutions better matching those natively stored in the stata format (:issue:`55642`)
- :meth:`DataFrame.agg` called with ``axis=1`` and a ``func`` which relabels the result index now raises a ``NotImplementedError`` (:issue:`58807`).
- :meth:`DataFrame.corr` now uses two pass Welford's Method to improve numerical stability with precision for very large/small values (:issue:`59652`)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think it exists. You are simply using two-pass.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Must have conflated the two, will update

- :meth:`Index.get_loc` now accepts also subclasses of ``tuple`` as keys (:issue:`57922`)
- :meth:`Styler.set_tooltips` provides alternative method to storing tooltips by using title attribute of td elements. (:issue:`56981`)
- Added missing parameter ``weights`` in :meth:`DataFrame.plot.kde` for the estimation of the PDF (:issue:`59337`)
Expand Down
56 changes: 30 additions & 26 deletions pandas/_libs/algos.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -345,7 +345,8 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None):
float64_t[:, ::1] result
uint8_t[:, :] mask
int64_t nobs = 0
float64_t vx, vy, dx, dy, meanx, meany, divisor, ssqdmx, ssqdmy, covxy, val
float64_t vx, vy, meanx, meany, divisor, ssqdmx, ssqdmy, cxy, val
float64_t sumx, sumy

N, K = (<object>mat).shape
if minp is None:
Expand All @@ -358,39 +359,42 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None):

with nogil:
for xi in range(K):
for yi in range(xi + 1):
for yi in range(xi+1):
# Welford's method for the variance-calculation
# https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
nobs = ssqdmx = ssqdmy = covxy = meanx = meany = 0
# Changed to Welford's two-pass for improved numeric stability
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This comment is misleading.

nobs = ssqdmx = ssqdmy = cxy = meanx = meany = 0
sumx = sumy = 0
for i in range(N):
if mask[i, xi] and mask[i, yi]:
vx = mat[i, xi]
vy = mat[i, yi]
sumx += mat[i, xi]
sumy += mat[i, yi]
nobs += 1
dx = vx - meanx
dy = vy - meany
meanx += 1. / nobs * dx
meany += 1. / nobs * dy
ssqdmx += (vx - meanx) * dx
ssqdmy += (vy - meany) * dy
covxy += (vx - meanx) * dy

if nobs < minpv:
result[xi, yi] = result[yi, xi] = NaN
continue
meanx = sumx / nobs
meany = sumy / nobs
for i in range(N):
if mask[i, xi] and mask[i, yi]:
vx = mat[i, xi] - meanx
vy = mat[i, yi] - meany
Comment on lines +380 to +381
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One of the reasons the Welford's algorithm is considered a "stable" algorithm is by mitigating cancellation in $x_i - \bar{x}$ by using a running average.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The problem with the current implementation is the accumulation of round off errors. The Kahan Summation can mitigate this issue.

cxy += vx * vy
ssqdmx += vx * vx
ssqdmy += vy * vy
divisor = (nobs - 1.0) if cov else sqrt(ssqdmx * ssqdmy)

# clip `covxy / divisor` to ensure coeff is within bounds
if divisor != 0:
val = cxy / divisor
if not cov:
if val > 1.0:
val = 1.0
elif val < -1.0:
val = -1.0
result[xi, yi] = result[yi, xi] = val
else:
divisor = (nobs - 1.0) if cov else sqrt(ssqdmx * ssqdmy)

# clip `covxy / divisor` to ensure coeff is within bounds
if divisor != 0:
val = covxy / divisor
if not cov:
if val > 1.0:
val = 1.0
elif val < -1.0:
val = -1.0
result[xi, yi] = result[yi, xi] = val
else:
result[xi, yi] = result[yi, xi] = NaN
result[xi, yi] = result[yi, xi] = NaN

return result.base

Expand Down
19 changes: 16 additions & 3 deletions pandas/tests/frame/methods/test_cov_corr.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,11 +210,15 @@ def test_corr_nullable_integer(self, nullable_column, other_column, method):
@pytest.mark.parametrize("length", [2, 20, 200, 2000])
def test_corr_for_constant_columns(self, length):
# GH: 37448
# now matches numpy behavior
df = DataFrame(length * [[0.4, 0.1]], columns=["A", "B"])
result = df.corr()
expected = DataFrame(
{"A": [np.nan, np.nan], "B": [np.nan, np.nan]}, index=["A", "B"]
)
if length == 2:
expected = DataFrame(
{"A": [np.nan, np.nan], "B": [np.nan, np.nan]}, index=["A", "B"]
)
else:
expected = DataFrame({"A": [1.0, 1.0], "B": [1.0, 1.0]}, index=["A", "B"])
tm.assert_frame_equal(result, expected)

def test_calc_corr_small_numbers(self):
Expand Down Expand Up @@ -493,3 +497,12 @@ def test_cov_with_missing_values(self):
result2 = df.dropna().cov()
tm.assert_frame_equal(result1, expected)
tm.assert_frame_equal(result2, expected)

def test_close_corr(self):
values = np.array(
[[30.0, 30.100000381469727], [116.80000305175781, 116.8000030517578]]
)
df = DataFrame(values.T)
result = df.corr(method="pearson")
expected = DataFrame(np.corrcoef(values[0], values[1]))
tm.assert_frame_equal(result, expected)
Loading