diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index 448ceffdaa1eb..b7256023bc66c 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -992,6 +992,7 @@ Numeric
 ^^^^^^^
 - Bug in :func:`api.types.infer_dtype` returning "mixed" for complex and ``pd.NA`` mix (:issue:`61976`)
 - Bug in :func:`api.types.infer_dtype` returning "mixed-integer-float" for float and ``pd.NA`` mix (:issue:`61621`)
+- Bug in :meth:`DataFrame.combine` and :meth:`DataFrame.combine_first` where integers with absolute value greater than ``2**53`` could lose precision after the operation (:issue:`60128`)
 - Bug in :meth:`DataFrame.corr` where numerical precision errors resulted in correlations above ``1.0`` (:issue:`61120`)
 - Bug in :meth:`DataFrame.cov` raises a ``TypeError`` instead of returning potentially incorrect results or other errors (:issue:`53115`)
 - Bug in :meth:`DataFrame.quantile` where the column type was not preserved when ``numeric_only=True`` with a list-like ``q`` produced an empty result (:issue:`59035`)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 9c41b82bbbc8e..b64bc5d786ed3 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -144,6 +144,10 @@
     PeriodArray,
     TimedeltaArray,
 )
+from pandas.core.arrays.integer import (
+    Int64Dtype,
+    UInt64Dtype,
+)
 from pandas.core.arrays.sparse import SparseFrameAccessor
 from pandas.core.arrays.string_ import StringDtype
 from pandas.core.construction import (
@@ -9025,6 +9029,67 @@ def combine(
         1  0.0  3.0  1.0
         2  NaN  3.0  1.0
         """
+
+        # GH#60128 Integers n with |n| > 2**53 would lose precision because align
+        # upcasts them to float. Avoid the lossy conversion by preemptively promoting
+        # int64 and uint64 to their nullable ExtensionDtypes, Int64 and UInt64.
+        def _ensure_nullable_int64_dtypes(df: DataFrame) -> DataFrame:
+            """Promote int64/uint64 DataFrame columns to Int64/UInt64."""
+            cast_map: dict[IndexLabel, DtypeObj] = {}
+            for col, dt in df.dtypes.items():
+                if dt == np.int64:
+                    cast_map[col] = Int64Dtype()
+                elif dt == np.uint64:
+                    cast_map[col] = UInt64Dtype()
+
+            if cast_map:
+                df = df.astype(cast_map)
+            return df
+
+        # To maintain backwards compatibility, downcast the pre-promoted int64
+        # columns of the combined DataFrame back to how they would have resolved.
+        # Consider just embracing nullable ExtensionDtypes instead, though.
+        def _revert_int64_dtype_promotion(
+            self_orig: DataFrame, other_orig: DataFrame, combined_df: DataFrame
+        ) -> DataFrame:
+            """Resolve the combined dtypes according to the original dtypes."""
+            cast_map: dict[IndexLabel, DtypeObj] = {}
+            for col in combined_df.columns:
+                ser = combined_df[col]
+                orig_dt_self = self_orig.dtypes.get(col)
+                orig_dt_other = other_orig.dtypes.get(col)
+
+                was_promoted = (orig_dt_self in [np.int64, np.uint64]) or (
+                    orig_dt_other in [np.int64, np.uint64]
+                )
+
+                if was_promoted:
+                    dtypes_to_resolve = [
+                        dt for dt in (orig_dt_self, orig_dt_other) if dt is not None
+                    ]
+                    if dtypes_to_resolve:
+                        if isna(ser).any():
+                            # If there are NAs, we can't safely downcast back
+                            # to int. Previously, we left the data as float64.
+                            # However, converting large integers to float can
+                            # lose precision, even if it's not immediately
+                            # obvious (since we don't cast back). Consider
+                            # embracing nullable ExtensionDtypes instead
+                            # and dropping this whole restoration step.
+                            dtypes_to_resolve.append(np.dtype(np.float64))
+                        target_type = find_common_type(dtypes_to_resolve)
+                        cast_map[col] = target_type
+
+            if cast_map:
+                combined_df = combined_df.astype(cast_map)
+            return combined_df
+
+        # store originals and prepare for align
+        self_orig = self
+        other_orig = other
+        self = _ensure_nullable_int64_dtypes(self)
+        other = _ensure_nullable_int64_dtypes(other)
+
         other_idxlen = len(other.index)  # save for compare
 
         other_columns = other.columns
@@ -9092,6 +9157,9 @@ def combine(
         # convert_objects just in case
         frame_result = self._constructor(result, index=new_index, columns=new_columns)
+        frame_result = _revert_int64_dtype_promotion(
+            self_orig, other_orig, frame_result
+        )
         return frame_result.__finalize__(self, method="combine")
 
     def combine_first(self, other: DataFrame) -> DataFrame:
@@ -9141,20 +9209,10 @@ def combine_first(self, other: DataFrame) -> DataFrame:
         1  0.0  3.0  1.0
         2  NaN  3.0  1.0
         """
-        from pandas.core.computation import expressions
 
         def combiner(x: Series, y: Series):
-            mask = x.isna()._values
-
-            x_values = x._values
-            y_values = y._values
-
-            # If the column y in other DataFrame is not in first DataFrame,
-            # just return y_values.
-            if y.name not in self.columns:
-                return y_values
-
-            return expressions.where(mask, y_values, x_values)
+            # GH#60128 The combiner is supposed to preserve EA dtypes.
+            return y if y.name not in self.columns else y.where(x.isna(), x)
 
         if len(other) == 0:
             combined = self.reindex(
diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py
index 1e594043510ea..a837f66dc8762 100644
--- a/pandas/tests/frame/methods/test_combine_first.py
+++ b/pandas/tests/frame/methods/test_combine_first.py
@@ -398,6 +398,21 @@ def test_combine_first_string_dtype_only_na(self, nullable_string_dtype):
         ).set_index(["a", "b"])
         tm.assert_frame_equal(result, expected)
 
+    @pytest.mark.parametrize(
+        "wide_val, dtype",
+        (
+            (1666880195890293744, "uint64"),
+            (-1666880195890293744, "int64"),
+        ),
+    )
+    def test_combine_first_preserve_precision(self, wide_val, dtype):
+        # GH#60128
+        df1 = DataFrame({"A": [wide_val, 5]}, dtype=dtype)
+        df2 = DataFrame({"A": [6, 7, wide_val]}, dtype=dtype)
+        result = df1.combine_first(df2)
+        expected = DataFrame({"A": [wide_val, 5, wide_val]}, dtype=dtype)
+        tm.assert_frame_equal(result, expected)
+
     @pytest.mark.parametrize(
         "scalar1, scalar2",
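
Below is a minimal standalone sketch (not part of the patch) of the precision
loss that GH#60128 addresses: int64/uint64 values beyond 2**53 cannot
round-trip through float64, which is the dtype the old align-based upcast
produced. The wide value is the same one the new test uses.

    import numpy as np
    import pandas as pd

    n = 1666880195890293744  # |n| > 2**53, exactly representable as int64
    # float64 carries only a 53-bit mantissa, so the round-trip is lossy
    assert int(np.float64(n)) != n

    # With the patch applied, the wide value and the int64 dtype survive;
    # previously, align upcast the column to float64 and corrupted it.
    df1 = pd.DataFrame({"A": [n, 5]}, dtype="int64")
    df2 = pd.DataFrame({"A": [6, 7, n]}, dtype="int64")
    print(df1.combine_first(df2))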