Skip to content

Commit e15bde9

Browse files
make wide ints nullable before align and restore after combining
1 parent a6b461c commit e15bde9

File tree

1 file changed

+59
-20
lines changed

1 file changed

+59
-20
lines changed

pandas/core/frame.py

Lines changed: 59 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -9025,6 +9025,52 @@ def combine(
90259025
1 0.0 3.0 1.0
90269026
2 NaN 3.0 1.0
90279027
"""
9028+
9029+
# GH#62691 Prevent lossy conversion of wide integers
9030+
# by proactively promoting them to their nullable versions
9031+
# because an outer align will force a round trip through float64.
9032+
def _promote_wide_ints(df: DataFrame) -> DataFrame:
    """
    Promote int64/uint64 columns to their nullable versions.

    Parameters
    ----------
    df : DataFrame
        Frame whose column dtypes are inspected.

    Returns
    -------
    DataFrame
        ``df`` itself when no column is int64/uint64; otherwise the result
        of ``df.astype`` with int64 columns cast to ``"Int64"`` and uint64
        columns cast to ``"UInt64"``.
    """
    # NumPy dtype -> name of the nullable extension dtype it is promoted to.
    promotions = (
        (np.dtype("int64"), "Int64"),
        (np.dtype("uint64"), "UInt64"),
    )
    # Keys are column labels, which may be any hashable (not only str).
    cast_map = {
        col: nullable
        for col, dt in df.dtypes.items()
        for wide, nullable in promotions
        if dt == wide
    }
    return df.astype(cast_map) if cast_map else df
9044+
9045+
# store originals before promotion
9046+
self_original = self
9047+
other_original = other
9048+
self = _promote_wide_ints(self)
9049+
other = _promote_wide_ints(other)
9050+
9051+
def _restore_wide_ints(df: DataFrame) -> DataFrame:
    """
    Restore previously int64/uint64 columns if they don't have NAs.

    A column is cast back only when its label had dtype int64 or uint64 in
    ``self_original`` or ``other_original`` and the column in ``df`` holds
    no missing values. The target dtype is ``find_common_type`` of the
    original dtypes recorded for that label in either frame.

    Parameters
    ----------
    df : DataFrame
        The combined frame.

    Returns
    -------
    DataFrame
        ``df`` itself when no column qualifies; otherwise the result of
        ``df.astype`` with the qualifying columns cast.
    """
    wide_ints = [np.int64, np.uint64]
    # Maps column label -> dtype object to cast back to (not dtype names).
    cast_map = {}
    for col in df.columns:
        # ``.get`` yields None for a label absent from a frame; such
        # entries take no part in the checks below.
        candidates = (
            self_original.dtypes.get(col),
            other_original.dtypes.get(col),
        )
        originals = [dt for dt in candidates if dt is not None]

        if not any(dt in wide_ints for dt in originals):
            continue
        # Columns that hold missing values are not cast back.
        if isna(df[col]).any():
            continue

        # ``originals`` is non-empty here: at least one entry is a wide int.
        cast_map[col] = find_common_type(originals)

    return df.astype(cast_map) if cast_map else df
9073+
90289074
other_idxlen = len(other.index) # save for compare
90299075
other_columns = other.columns
90309076

@@ -9092,6 +9138,7 @@ def combine(
90929138

90939139
# convert_objects just in case
90949140
frame_result = self._constructor(result, index=new_index, columns=new_columns)
9141+
frame_result = _restore_wide_ints(frame_result)
90959142
return frame_result.__finalize__(self, method="combine")
90969143

90979144
def combine_first(self, other: DataFrame) -> DataFrame:
@@ -9141,35 +9188,27 @@ def combine_first(self, other: DataFrame) -> DataFrame:
91419188
1 0.0 3.0 1.0
91429189
2 NaN 3.0 1.0
91439190
"""
9191+
from pandas.core.computation import expressions
91449192

91459193
def combiner(x: Series, y: Series):
9146-
# GH#60128 Preserve EA dtypes by operating at the Series level.
9147-
# If 'y' is a new column, return it as-is; otherwise fill <NA> in 'x'
9148-
# from 'y'. Avoids dropping to NumPy arrays (which would lose
9149-
# Int64/UInt64 and reintroduce float64 paths).
9150-
return y if y.name not in self.columns else y.where(x.isna(), x)
9194+
mask = x.isna()._values
9195+
9196+
x_values = x._values
9197+
y_values = y._values
9198+
9199+
# If the column y in other DataFrame is not in first DataFrame,
9200+
# just return y_values.
9201+
if y.name not in self.columns:
9202+
return y_values
9203+
9204+
return expressions.where(mask, y_values, x_values)
91519205

91529206
if len(other) == 0:
91539207
combined = self.reindex(
91549208
self.columns.append(other.columns.difference(self.columns)), axis=1
91559209
)
91569210
combined = combined.astype(other.dtypes)
91579211
else:
9158-
# GH#60128 Avoid precision loss from int64/uint64 <-> float64 round-trip.
9159-
def _promote_ints_to_nullable(df: DataFrame) -> DataFrame:
9160-
cast_map: dict[str, str] = {}
9161-
9162-
for col, dt in df.dtypes.items():
9163-
if dt == np.dtype("uint64"):
9164-
cast_map[col] = "UInt64"
9165-
elif dt == np.dtype("int64"):
9166-
cast_map[col] = "Int64"
9167-
9168-
return df.astype(cast_map) if cast_map else df
9169-
9170-
self = _promote_ints_to_nullable(self)
9171-
other = _promote_ints_to_nullable(other)
9172-
91739212
combined = self.combine(other, combiner, overwrite=False)
91749213

91759214
dtypes = {

0 commit comments

Comments
 (0)