From 99ae6728d46cb88b4543c40564876619eba3f5e7 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 11 Sep 2025 07:44:20 -0700 Subject: [PATCH 01/15] ENH: fill_value in frame+series flex ops --- pandas/core/frame.py | 48 +++++++++++----------- pandas/tests/arithmetic/test_period.py | 17 ++------ pandas/tests/arrays/string_/test_string.py | 1 - pandas/tests/frame/test_arithmetic.py | 12 +++--- 4 files changed, 33 insertions(+), 45 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 91f5cd1679a61..524240ba3ac66 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8468,27 +8468,34 @@ def _maybe_align_series_as_frame(self, series: Series, axis: AxisInt): blockwise. """ rvalues = series._values - if not isinstance(rvalues, np.ndarray): - # TODO(EA2D): no need to special-case with 2D EAs - if rvalues.dtype in ("datetime64[ns]", "timedelta64[ns]"): - # We can losslessly+cheaply cast to ndarray - rvalues = np.asarray(rvalues) + if lib.is_np_dtype(rvalues.dtype): + # We can losslessly+cheaply cast to ndarray + # i.e. ndarray or dt64[naive], td64 + # TODO(EA2D): no need to special case with 2D EAs + rvalues = np.asarray(rvalues) + + if axis == 0: + rvalues = rvalues.reshape(-1, 1) else: - return series + rvalues = rvalues.reshape(1, -1) - if axis == 0: - rvalues = rvalues.reshape(-1, 1) - else: - rvalues = rvalues.reshape(1, -1) + rvalues = np.broadcast_to(rvalues, self.shape) + # pass dtype to avoid doing inference + df = self._constructor(rvalues, dtype=rvalues.dtype) - rvalues = np.broadcast_to(rvalues, self.shape) - # pass dtype to avoid doing inference - return self._constructor( - rvalues, - index=self.index, - columns=self.columns, - dtype=rvalues.dtype, - ).__finalize__(series) + else: + # GH#61581 + if axis == 0: + df = DataFrame(dict.fromkeys(range(self.shape[1]), rvalues)) + else: + nrows = self.shape[0] + df = DataFrame( + {i: rvalues[[i]].repeat(nrows) for i in range(self.shape[1])}, + dtype=rvalues.dtype, + ) + df.index = self.index + df.columns = self.columns + return df.__finalize__(series) def _flex_arith_method( self, other, op, *, axis: Axis = "columns", level=None, fill_value=None @@ -8498,11 +8505,6 @@ def _flex_arith_method( if self._should_reindex_frame_op(other, op, axis, fill_value, level): return self._arith_method_with_reindex(other, op) - if isinstance(other, Series) and fill_value is not None: - # TODO: We could allow this in cases where we end up going - # through the DataFrame path - raise NotImplementedError(f"fill_value {fill_value} not supported.") - other = ops.maybe_prepare_scalar_for_op(other, self.shape) self, other = self._align_for_op(other, axis, flex=True, level=level) diff --git a/pandas/tests/arithmetic/test_period.py b/pandas/tests/arithmetic/test_period.py index 67762e0b89c73..305022878ce79 100644 --- a/pandas/tests/arithmetic/test_period.py +++ b/pandas/tests/arithmetic/test_period.py @@ -1361,12 +1361,8 @@ def test_period_add_timestamp_raises(self, box_with_array): arr + ts with pytest.raises(TypeError, match=msg): ts + arr - if box_with_array is pd.DataFrame: - # TODO: before implementing resolution-inference we got the same - # message with DataFrame and non-DataFrame. Why did that change? 
- msg = "cannot add PeriodArray and Timestamp" - else: - msg = "cannot add PeriodArray and DatetimeArray" + + msg = "cannot add PeriodArray and DatetimeArray" with pytest.raises(TypeError, match=msg): arr + Series([ts]) with pytest.raises(TypeError, match=msg): @@ -1376,16 +1372,9 @@ def test_period_add_timestamp_raises(self, box_with_array): with pytest.raises(TypeError, match=msg): pd.Index([ts]) + arr - if box_with_array is pd.DataFrame: - msg = "cannot add PeriodArray and DatetimeArray" - else: - msg = r"unsupported operand type\(s\) for \+: 'Period' and 'DatetimeArray" + msg = "cannot add PeriodArray and DatetimeArray" with pytest.raises(TypeError, match=msg): arr + pd.DataFrame([ts]) - if box_with_array is pd.DataFrame: - msg = "cannot add PeriodArray and DatetimeArray" - else: - msg = r"unsupported operand type\(s\) for \+: 'DatetimeArray' and 'Period'" with pytest.raises(TypeError, match=msg): pd.DataFrame([ts]) + arr diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index f10ebda94dc6a..01db9a9b9e2e2 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -249,7 +249,6 @@ def test_mul(dtype): tm.assert_extension_array_equal(result, expected) -@pytest.mark.xfail(reason="GH-28527") def test_add_strings(dtype): arr = pd.array(["a", "b", "c", "d"], dtype=dtype) df = pd.DataFrame([["t", "y", "v", "w"]], dtype=object) diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index a9a98a5005bb3..3ddc8c1d56820 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -626,11 +626,9 @@ def test_arith_flex_frame_corner(self, float_frame): expected = float_frame.sort_index() * np.nan tm.assert_frame_equal(result, expected) - with pytest.raises(NotImplementedError, match="fill_value"): - float_frame.add(float_frame.iloc[0], fill_value=3) + res = float_frame.add(float_frame.iloc[0], fill_value=3) - with pytest.raises(NotImplementedError, match="fill_value"): - float_frame.add(float_frame.iloc[0], axis="index", fill_value=3) + res = float_frame.add(float_frame.iloc[0], axis="index", fill_value=3) @pytest.mark.parametrize("op", ["add", "sub", "mul", "mod"]) def test_arith_flex_series_ops(self, simple_frame, op): @@ -672,11 +670,11 @@ def test_arith_flex_zero_len_raises(self): df_len0 = DataFrame(columns=["A", "B"]) df = DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) - with pytest.raises(NotImplementedError, match="fill_value"): + msg = r"unsupported operand type\(s\) for \+: 'int' and 'str'" + with pytest.raises(TypeError, match=msg): df.add(ser_len0, fill_value="E") - with pytest.raises(NotImplementedError, match="fill_value"): - df_len0.sub(df["A"], axis=None, fill_value=3) + df_len0.sub(df["A"], axis=None, fill_value=3) def test_flex_add_scalar_fill_value(self): # GH#12723 From 4e77fb77b051b6de7ba4a94d691586f842842f3b Mon Sep 17 00:00:00 2001 From: eicchen Date: Wed, 1 Oct 2025 15:28:43 -0500 Subject: [PATCH 02/15] Updated version of 62317 --- doc/source/whatsnew/v3.0.0.rst | 3 ++ pandas/core/arrays/arrow/array.py | 14 +++++- pandas/tests/arithmetic/test_period.py | 1 - .../tests/arrays/boolean/test_arithmetic.py | 2 +- .../tests/arrays/floating/test_arithmetic.py | 38 +++++++++++++- .../tests/arrays/integer/test_arithmetic.py | 20 ++++++++ pandas/tests/arrays/string_/test_string.py | 49 +++++++++++++++++-- 7 files changed, 118 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst 
b/doc/source/whatsnew/v3.0.0.rst index d721213dc38e7..e0757bbdb70d7 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -200,6 +200,8 @@ Other enhancements - :class:`Holiday` has gained the constructor argument and field ``exclude_dates`` to exclude specific datetimes from a custom holiday calendar (:issue:`54382`) - :class:`Rolling` and :class:`Expanding` now support ``nunique`` (:issue:`26958`) - :class:`Rolling` and :class:`Expanding` now support aggregations ``first`` and ``last`` (:issue:`33155`) +- :class:`StringDtype` now supports addition to Series/DataFrame with strings (:issue:`61581`) + - :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`) - :meth:`.DataFrameGroupBy.transform`, :meth:`.SeriesGroupBy.transform`, :meth:`.DataFrameGroupBy.agg`, :meth:`.SeriesGroupBy.agg`, :meth:`.SeriesGroupBy.apply`, :meth:`.DataFrameGroupBy.apply` now support ``kurt`` (:issue:`40139`) - :meth:`DataFrame.apply` supports using third-party execution engines like the Bodo.ai JIT compiler (:issue:`60668`) @@ -998,6 +1000,7 @@ MultiIndex - :func:`MultiIndex.get_level_values` accessing a :class:`DatetimeIndex` does not carry the frequency attribute along (:issue:`58327`, :issue:`57949`) - Bug in :class:`DataFrame` arithmetic operations in case of unaligned MultiIndex columns (:issue:`60498`) - Bug in :class:`DataFrame` arithmetic operations with :class:`Series` in case of unaligned MultiIndex (:issue:`61009`) +- Bug in :class:`DataFrame` arithmetic operations with :class:`Series` now works with ``fill_value`` parameter (:issue:`61581`) - Bug in :meth:`MultiIndex.from_tuples` causing wrong output with input of type tuples having NaN values (:issue:`60695`, :issue:`60988`) - Bug in :meth:`DataFrame.__setitem__` where column alignment logic would reindex the assigned value with an empty index, incorrectly setting all values to ``NaN``.(:issue:`61841`) - Bug in :meth:`DataFrame.reindex` and :meth:`Series.reindex` where reindexing :class:`Index` to a :class:`MultiIndex` would incorrectly set all values to ``NaN``.(:issue:`60923`) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 2eed608908440..29e7c21caced2 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -890,7 +890,14 @@ def _op_method_error_message(self, other, op) -> str: def _evaluate_op_method(self, other, op, arrow_funcs) -> Self: pa_type = self._pa_array.type other_original = other - other = self._box_pa(other) + try: + other = self._box_pa(other) + except (ValueError, pa.lib.ArrowTypeError) as err: + # Categorical and Interval dtype raises errors in self._box_pa + # Could be fixed in the future if needed + raise TypeError( + "Incompatible type when converting to PyArrow dtype for operation." 
+ ) from err if ( pa.types.is_string(pa_type) @@ -899,6 +906,11 @@ def _evaluate_op_method(self, other, op, arrow_funcs) -> Self: ): if op in [operator.add, roperator.radd]: sep = pa.scalar("", type=pa_type) + if not (is_scalar(other) or isinstance(other, pa.Scalar)): + if len(other) == 0 or isna(other).any(): + other = other.cast(pa_type) + elif isna(other): + other = other.cast(pa_type) try: if op is operator.add: result = pc.binary_join_element_wise(self._pa_array, other, sep) diff --git a/pandas/tests/arithmetic/test_period.py b/pandas/tests/arithmetic/test_period.py index 305022878ce79..04587243b5b07 100644 --- a/pandas/tests/arithmetic/test_period.py +++ b/pandas/tests/arithmetic/test_period.py @@ -1361,7 +1361,6 @@ def test_period_add_timestamp_raises(self, box_with_array): arr + ts with pytest.raises(TypeError, match=msg): ts + arr - msg = "cannot add PeriodArray and DatetimeArray" with pytest.raises(TypeError, match=msg): arr + Series([ts]) diff --git a/pandas/tests/arrays/boolean/test_arithmetic.py b/pandas/tests/arrays/boolean/test_arithmetic.py index 312dfb72e0950..555e69dc82589 100644 --- a/pandas/tests/arrays/boolean/test_arithmetic.py +++ b/pandas/tests/arrays/boolean/test_arithmetic.py @@ -118,7 +118,7 @@ def test_error_invalid_values(data, all_arithmetic_operators): ops(pd.Timestamp("20180101")) # invalid array-likes - if op not in ("__mul__", "__rmul__"): + if op not in ("__mul__", "__rmul__", "__add__", "__radd__"): # TODO(extension) numpy's mul with object array sees booleans as numbers msg = "|".join( [ diff --git a/pandas/tests/arrays/floating/test_arithmetic.py b/pandas/tests/arrays/floating/test_arithmetic.py index 777099e76fc73..fc2cf1a385d42 100644 --- a/pandas/tests/arrays/floating/test_arithmetic.py +++ b/pandas/tests/arrays/floating/test_arithmetic.py @@ -152,8 +152,42 @@ def test_error_invalid_values(data, all_arithmetic_operators): ops(pd.Timestamp("20180101")) # invalid array-likes - with pytest.raises(TypeError, match=msg): - ops(pd.Series("foo", index=s.index)) + str_ser = pd.Series("foo", index=s.index) + if ( + all_arithmetic_operators + in [ + "__add__", + "__radd__", + ] + and pd.options.future.infer_string + ): + res = ops(str_ser) + if all_arithmetic_operators == "__radd__": + data_expected = [] + for i in data: + if pd.isna(i): + data_expected.append(i) + elif i.is_integer(): + data_expected.append("foo" + str(int(i))) + else: + data_expected.append("foo" + str(i)) + + expected = pd.Series(data_expected, index=s.index) + else: + data_expected = [] + for i in data: + if pd.isna(i): + data_expected.append(i) + elif i.is_integer(): + data_expected.append(str(int(i)) + "foo") + else: + data_expected.append(str(i) + "foo") + + expected = pd.Series(data_expected, index=s.index) + tm.assert_series_equal(res, expected) + else: + with pytest.raises(TypeError, match=msg): + ops(str_ser) msg = "|".join( [ diff --git a/pandas/tests/arrays/integer/test_arithmetic.py b/pandas/tests/arrays/integer/test_arithmetic.py index aeceb9b8a3cb1..9aece48f2ea38 100644 --- a/pandas/tests/arrays/integer/test_arithmetic.py +++ b/pandas/tests/arrays/integer/test_arithmetic.py @@ -197,6 +197,26 @@ def test_error_invalid_values(data, all_arithmetic_operators): # assert_almost_equal stricter, but the expected with pd.NA seems # more-correct than np.nan here. 
tm.assert_series_equal(res, expected) + elif ( + all_arithmetic_operators + in [ + "__add__", + "__radd__", + ] + and pd.options.future.infer_string + ): + res = ops(str_ser) + if all_arithmetic_operators == "__radd__": + expected = pd.Series( + [np.nan if pd.isna(x) == 1 else "foo" + str(x) for x in data], + index=s.index, + ) + else: + expected = pd.Series( + [np.nan if pd.isna(x) == 1 else str(x) + "foo" for x in data], + index=s.index, + ) + tm.assert_series_equal(res, expected) else: with tm.external_error_raised(TypeError): ops(str_ser) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 01db9a9b9e2e2..a038966d9f2e6 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -249,6 +249,19 @@ def test_mul(dtype): tm.assert_extension_array_equal(result, expected) +def test_add_series(dtype): + arr = pd.array(["a", "b", "c", "d"], dtype=dtype) + df = pd.Series(["t", "y", "v", "w"], dtype=object) + + result = arr + df + expected = pd.Series(["at", "by", "cv", "dw"]).astype(dtype) + tm.assert_series_equal(result, expected) + + result = df + arr + expected = pd.Series(["ta", "yb", "vc", "wd"]).astype(dtype) + tm.assert_series_equal(result, expected) + + def test_add_strings(dtype): arr = pd.array(["a", "b", "c", "d"], dtype=dtype) df = pd.DataFrame([["t", "y", "v", "w"]], dtype=object) @@ -263,20 +276,48 @@ def test_add_strings(dtype): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(reason="GH-28527") def test_add_frame(dtype): arr = pd.array(["a", "b", np.nan, np.nan], dtype=dtype) df = pd.DataFrame([["x", np.nan, "y", np.nan]]) - assert arr.__add__(df) is NotImplemented result = arr + df expected = pd.DataFrame([["ax", np.nan, np.nan, np.nan]]).astype(dtype) - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected, check_dtype=False) result = df + arr expected = pd.DataFrame([["xa", np.nan, np.nan, np.nan]]).astype(dtype) - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected, check_dtype=False) + + +@pytest.mark.parametrize( + "invalid", + [ + 10, + 1.5, + pd.Timedelta(hours=31), + pd.Timestamp("2021-01-01"), + True, + pd.Period("2025-09"), + pd.Categorical(["test"]), + pd.offsets.Minute(3), + pd.Interval(1, 2, closed="right"), + ], +) +def test_add_frame_invalid(dtype, invalid): + arr = pd.array(["a", np.nan], dtype=dtype) + df = pd.DataFrame([[invalid, invalid]]) + + msg = "|".join( + [ + r"can only concatenate str \(not \".+\"\) to str", + r"unsupported operand type\(s\) for \+: '.+' and 'str'", + r"operation 'add' not supported for dtype 'str|string' with dtype '.+'", + "Incompatible type when converting to PyArrow dtype for operation.", + ] + ) + with pytest.raises(TypeError, match=msg): + arr + df def test_comparison_methods_scalar(comparison_op, dtype): From eb12b34bd3f994d5cfdab7cd071c4fb2a1be4640 Mon Sep 17 00:00:00 2001 From: eicchen Date: Sun, 5 Oct 2025 14:55:08 -0500 Subject: [PATCH 03/15] WIP frame addition with fill_value test --- pandas/tests/frame/test_arithmetic.py | 82 ++++++++++++++++++++++++++- 1 file changed, 80 insertions(+), 2 deletions(-) diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 3ddc8c1d56820..9adc80ed672d4 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -626,9 +626,29 @@ def test_arith_flex_frame_corner(self, float_frame): expected = float_frame.sort_index() * np.nan 
tm.assert_frame_equal(result, expected) - res = float_frame.add(float_frame.iloc[0], fill_value=3) + @pytest.mark.parametrize("axis", [0, 1]) + def test_arith_flex_frame_fill_value_corner(self, float_frame, axis): + rng = np.random.default_rng(60) + mask = rng.random(float_frame.shape) < 0.2 + left = float_frame.mask(mask) + right = left.iloc[0].value - res = float_frame.add(float_frame.iloc[0], axis="index", fill_value=3) + result = left.add(right, axis=axis, fill_value=3) + + if axis == 0: # axis = index, vertical + expected = result * np.nan + + else: # axis = columns, horizontal + right_df = DataFrame([right] * result.shape[0], index=result.index) + left_filled = left.fillna(3) + right_filled = right_df.fillna(3) + expected = right_filled + left_filled + expected = expected.mask(expected == 6, pd.NA) + + print(result) + print(expected) + + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("op", ["add", "sub", "mul", "mod"]) def test_arith_flex_series_ops(self, simple_frame, op): @@ -2190,3 +2210,61 @@ def test_mixed_col_index_dtype(string_dtype_no_object): expected.columns = expected.columns.astype(string_dtype_no_object) tm.assert_frame_equal(result, expected) + + +dt_params = [ + (tm.ALL_INT_NUMPY_DTYPES[0], 5), + (tm.ALL_INT_EA_DTYPES[0], 5), + (tm.FLOAT_NUMPY_DTYPES[0], 4.9), + (tm.FLOAT_EA_DTYPES[0], 4.9), +] + +axes = [0, 1] + + +@pytest.mark.parametrize( + "data_type,fill_val, axis", + [(dt, val, axis) for axis in axes for dt, val in dt_params], +) +def test_df_fill_value_dtype(data_type, fill_val, axis): + # GH 61581 + base_data = np.arange(25).reshape(5, 5) + mult_list = [1, np.nan, 5, np.nan, 3] + np_int_flag = 0 + + try: + mult_data = pd.array(mult_list, dtype=data_type) + except ValueError as e: + # Numpy int type cannot represent NaN, it will end up here + if "cannot convert float NaN to integer" in str(e): + mult_data = np.asarray(mult_list) + np_int_flag = 1 + + columns = list("ABCDE") + df = DataFrame(base_data, columns=columns) + + for i in range(df.shape[0]): + try: + df.iat[i, i] = np.nan + df.iat[i + 1, i] = pd.NA + df.iat[i + 3, i] = pd.NA + except IndexError: + pass + + mult_mat = np.broadcast_to(mult_data, df.shape) + if axis == 0: + mask = np.isnan(mult_mat).T + else: + mask = np.isnan(mult_mat) + mask = df.isna().values & mask + + df_result = df.mul(mult_data, axis=axis, fill_value=fill_val) + if np_int_flag == 1: + mult_np = np.nan_to_num(mult_data, nan=fill_val) + df_expected = (df.fillna(fill_val).mul(mult_np, axis=axis)).mask(mask, np.nan) + else: + df_expected = ( + df.fillna(fill_val).mul(mult_data.fillna(fill_val), axis=axis) + ).mask(mask, np.nan) + + tm.assert_frame_equal(df_result, df_expected) From 7e23b65699336634c715dff5a75e66ad8740e687 Mon Sep 17 00:00:00 2001 From: eicchen Date: Sun, 5 Oct 2025 18:15:21 -0500 Subject: [PATCH 04/15] Completed flex frame fill_value testcase --- pandas/tests/frame/test_arithmetic.py | 34 +++++++++++++++++++-------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 9adc80ed672d4..6a84c1de6fdc1 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -631,22 +631,36 @@ def test_arith_flex_frame_fill_value_corner(self, float_frame, axis): rng = np.random.default_rng(60) mask = rng.random(float_frame.shape) < 0.2 left = float_frame.mask(mask) - right = left.iloc[0].value + right = left.iloc[0] result = left.add(right, axis=axis, fill_value=3) if axis == 0: # axis = 
index, vertical - expected = result * np.nan + pad_num = abs(result.shape[0] - len(right)) + mult_num = result.shape[1] + right_pad = np.pad( + right, (0, pad_num), mode="constant", constant_values=(np.nan) + ) + right_df = DataFrame( + [right_pad] * mult_num, columns=result.index, index=result.columns + ).T + + left = left.reindex_like(result) else: # axis = columns, horizontal - right_df = DataFrame([right] * result.shape[0], index=result.index) - left_filled = left.fillna(3) - right_filled = right_df.fillna(3) - expected = right_filled + left_filled - expected = expected.mask(expected == 6, pd.NA) - - print(result) - print(expected) + pad_num = abs(result.shape[1] - len(right)) + mult_num = result.shape[0] + right_pad = np.pad( + right, (0, pad_num), mode="constant", constant_values=(np.nan) + ) + right_df = DataFrame( + [right_pad] * mult_num, index=result.index, columns=result.columns + ) + + left_filled = left.fillna(3) + right_filled = right_df.fillna(3) + expected = right_filled + left_filled + expected = expected.mask(expected == 6, pd.NA) tm.assert_frame_equal(result, expected) From 46171081df7738c6a86b709a78320e8a82d81b25 Mon Sep 17 00:00:00 2001 From: eicchen Date: Sun, 5 Oct 2025 18:34:43 -0500 Subject: [PATCH 05/15] Removed type-casting shenanigans in array.py --- pandas/core/arrays/arrow/array.py | 5 ----- .../tests/arrays/integer/test_arithmetic.py | 20 ------------------- 2 files changed, 25 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 29e7c21caced2..ebbaa450c8c3b 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -906,11 +906,6 @@ def _evaluate_op_method(self, other, op, arrow_funcs) -> Self: ): if op in [operator.add, roperator.radd]: sep = pa.scalar("", type=pa_type) - if not (is_scalar(other) or isinstance(other, pa.Scalar)): - if len(other) == 0 or isna(other).any(): - other = other.cast(pa_type) - elif isna(other): - other = other.cast(pa_type) try: if op is operator.add: result = pc.binary_join_element_wise(self._pa_array, other, sep) diff --git a/pandas/tests/arrays/integer/test_arithmetic.py b/pandas/tests/arrays/integer/test_arithmetic.py index 9aece48f2ea38..aeceb9b8a3cb1 100644 --- a/pandas/tests/arrays/integer/test_arithmetic.py +++ b/pandas/tests/arrays/integer/test_arithmetic.py @@ -197,26 +197,6 @@ def test_error_invalid_values(data, all_arithmetic_operators): # assert_almost_equal stricter, but the expected with pd.NA seems # more-correct than np.nan here. 
tm.assert_series_equal(res, expected) - elif ( - all_arithmetic_operators - in [ - "__add__", - "__radd__", - ] - and pd.options.future.infer_string - ): - res = ops(str_ser) - if all_arithmetic_operators == "__radd__": - expected = pd.Series( - [np.nan if pd.isna(x) == 1 else "foo" + str(x) for x in data], - index=s.index, - ) - else: - expected = pd.Series( - [np.nan if pd.isna(x) == 1 else str(x) + "foo" for x in data], - index=s.index, - ) - tm.assert_series_equal(res, expected) else: with tm.external_error_raised(TypeError): ops(str_ser) From bca56fe87c8974fa90504b0fba2f08c3467e0dd3 Mon Sep 17 00:00:00 2001 From: eicchen Date: Sun, 5 Oct 2025 20:24:39 -0500 Subject: [PATCH 06/15] Removed float addition test, reintroduced check for float type nulls --- pandas/core/arrays/arrow/array.py | 3 ++ .../tests/arrays/floating/test_arithmetic.py | 37 +------------------ pandas/tests/frame/test_arithmetic.py | 12 +++++- 3 files changed, 16 insertions(+), 36 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index ebbaa450c8c3b..9a79ed0d99c9f 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -906,6 +906,9 @@ def _evaluate_op_method(self, other, op, arrow_funcs) -> Self: ): if op in [operator.add, roperator.radd]: sep = pa.scalar("", type=pa_type) + if is_scalar(other) or isinstance(other, pa.Scalar): + if len(other) == 0 or isna(other).any(): + other = other.cast(pa_type) try: if op is operator.add: result = pc.binary_join_element_wise(self._pa_array, other, sep) diff --git a/pandas/tests/arrays/floating/test_arithmetic.py b/pandas/tests/arrays/floating/test_arithmetic.py index fc2cf1a385d42..a06ef2cc19526 100644 --- a/pandas/tests/arrays/floating/test_arithmetic.py +++ b/pandas/tests/arrays/floating/test_arithmetic.py @@ -153,41 +153,8 @@ def test_error_invalid_values(data, all_arithmetic_operators): # invalid array-likes str_ser = pd.Series("foo", index=s.index) - if ( - all_arithmetic_operators - in [ - "__add__", - "__radd__", - ] - and pd.options.future.infer_string - ): - res = ops(str_ser) - if all_arithmetic_operators == "__radd__": - data_expected = [] - for i in data: - if pd.isna(i): - data_expected.append(i) - elif i.is_integer(): - data_expected.append("foo" + str(int(i))) - else: - data_expected.append("foo" + str(i)) - - expected = pd.Series(data_expected, index=s.index) - else: - data_expected = [] - for i in data: - if pd.isna(i): - data_expected.append(i) - elif i.is_integer(): - data_expected.append(str(int(i)) + "foo") - else: - data_expected.append(str(i) + "foo") - - expected = pd.Series(data_expected, index=s.index) - tm.assert_series_equal(res, expected) - else: - with pytest.raises(TypeError, match=msg): - ops(str_ser) + with pytest.raises(TypeError, match=msg): + ops(str_ser) msg = "|".join( [ diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 6a84c1de6fdc1..718c2a776d2ee 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -708,7 +708,17 @@ def test_arith_flex_zero_len_raises(self): with pytest.raises(TypeError, match=msg): df.add(ser_len0, fill_value="E") - df_len0.sub(df["A"], axis=None, fill_value=3) + result = df_len0.sub(df, axis=None, fill_value=3) + expected = DataFrame([[2, 1], [0, -1]], columns=["A", "B"]) + tm.assert_frame_equal(result, expected, check_dtype=False) + + result = df_len0.sub(df["A"], axis=0, fill_value=3) + expected = DataFrame([[2, 2], [0, 0]], columns=["A", "B"]) + 
tm.assert_frame_equal(result, expected, check_dtype=False) + + result = df_len0.sub(df["A"], axis=1, fill_value=3) + expected = DataFrame([], columns=["A", "B", 0, 1]) + tm.assert_frame_equal(result, expected, check_dtype=False) def test_flex_add_scalar_fill_value(self): # GH#12723 From 7273396da42be7a792a3c3c1ce8986761c2bfd47 Mon Sep 17 00:00:00 2001 From: eicchen Date: Mon, 6 Oct 2025 18:54:44 -0500 Subject: [PATCH 07/15] Edited pyarrow catch to be more specific --- pandas/core/arrays/arrow/array.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 9a79ed0d99c9f..d7dc13c96c22d 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -906,9 +906,13 @@ def _evaluate_op_method(self, other, op, arrow_funcs) -> Self: ): if op in [operator.add, roperator.radd]: sep = pa.scalar("", type=pa_type) - if is_scalar(other) or isinstance(other, pa.Scalar): - if len(other) == 0 or isna(other).any(): - other = other.cast(pa_type) + if ( + pa.types.is_string(other.type) + or pa.types.is_large_string(other.type) + or pa.types.is_binary(other.type) + or isna(other).all() + ): + other = other.cast(pa_type) try: if op is operator.add: result = pc.binary_join_element_wise(self._pa_array, other, sep) From 4493e0838dc5eadc65cecf56c511fa7ec41ee7bb Mon Sep 17 00:00:00 2001 From: eicchen Date: Wed, 8 Oct 2025 13:14:30 -0500 Subject: [PATCH 08/15] Updated fill_value test case, Updated whatsnew --- doc/source/whatsnew/v3.0.0.rst | 4 +-- pandas/tests/frame/test_arithmetic.py | 47 ++++++++++++--------------- 2 files changed, 22 insertions(+), 29 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index e0757bbdb70d7..520f05d809f22 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -200,10 +200,10 @@ Other enhancements - :class:`Holiday` has gained the constructor argument and field ``exclude_dates`` to exclude specific datetimes from a custom holiday calendar (:issue:`54382`) - :class:`Rolling` and :class:`Expanding` now support ``nunique`` (:issue:`26958`) - :class:`Rolling` and :class:`Expanding` now support aggregations ``first`` and ``last`` (:issue:`33155`) -- :class:`StringDtype` now supports addition to Series/DataFrame with strings (:issue:`61581`) - +- :class:`StringDtype` now supports addition while maintaining element typing (:issue:`61581`) - :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`) - :meth:`.DataFrameGroupBy.transform`, :meth:`.SeriesGroupBy.transform`, :meth:`.DataFrameGroupBy.agg`, :meth:`.SeriesGroupBy.agg`, :meth:`.SeriesGroupBy.apply`, :meth:`.DataFrameGroupBy.apply` now support ``kurt`` (:issue:`40139`) +- :meth:`DataFrame.add` now supports string addition with null-likes (:issue:`61581`) - :meth:`DataFrame.apply` supports using third-party execution engines like the Bodo.ai JIT compiler (:issue:`60668`) - :meth:`DataFrame.iloc` and :meth:`Series.iloc` now support boolean masks in ``__getitem__`` for more consistent indexing behavior (:issue:`60994`) - :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now support Python's new-style format strings (e.g., ``"{:.6f}"``) for the ``float_format`` parameter, in addition to old-style ``%`` format strings and 
callables. This allows for more flexible and modern formatting of floating point numbers when exporting to CSV. (:issue:`49580`) diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 718c2a776d2ee..077bf97b4833d 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -627,7 +627,7 @@ def test_arith_flex_frame_corner(self, float_frame): tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("axis", [0, 1]) - def test_arith_flex_frame_fill_value_corner(self, float_frame, axis): + def test_arith_flex_frame_fill_value_series(self, float_frame, axis): rng = np.random.default_rng(60) mask = rng.random(float_frame.shape) < 0.2 left = float_frame.mask(mask) @@ -2237,8 +2237,8 @@ def test_mixed_col_index_dtype(string_dtype_no_object): dt_params = [ - (tm.ALL_INT_NUMPY_DTYPES[0], 5), - (tm.ALL_INT_EA_DTYPES[0], 5), + (tm.ALL_INT_NUMPY_DTYPES[0], 10), + (tm.ALL_INT_EA_DTYPES[0], 10), (tm.FLOAT_NUMPY_DTYPES[0], 4.9), (tm.FLOAT_EA_DTYPES[0], 4.9), ] @@ -2250,45 +2250,38 @@ def test_mixed_col_index_dtype(string_dtype_no_object): "data_type,fill_val, axis", [(dt, val, axis) for axis in axes for dt, val in dt_params], ) -def test_df_fill_value_dtype(data_type, fill_val, axis): +def test_df_mul_array_fill_value(data_type, fill_val, axis): # GH 61581 - base_data = np.arange(25).reshape(5, 5) - mult_list = [1, np.nan, 5, np.nan, 3] - np_int_flag = 0 + base_data = np.arange(12).reshape(4, 3) + df = DataFrame(base_data) + mult_list = [np.nan, 1, 5, np.nan] + mult_list = mult_list[: df.shape[axis]] - try: - mult_data = pd.array(mult_list, dtype=data_type) - except ValueError as e: - # Numpy int type cannot represent NaN, it will end up here - if "cannot convert float NaN to integer" in str(e): - mult_data = np.asarray(mult_list) - np_int_flag = 1 + if data_type in tm.ALL_INT_NUMPY_DTYPES: + # Numpy int type cannot represent NaN + mult_np = np.asarray(mult_list) + mult_list = np.nan_to_num(mult_np, nan=fill_val) - columns = list("ABCDE") - df = DataFrame(base_data, columns=columns) + mult_data = pd.array(mult_list, dtype=data_type) for i in range(df.shape[0]): try: df.iat[i, i] = np.nan - df.iat[i + 1, i] = pd.NA - df.iat[i + 3, i] = pd.NA + df.iat[i + 2, i] = pd.NA except IndexError: pass - mult_mat = np.broadcast_to(mult_data, df.shape) if axis == 0: - mask = np.isnan(mult_mat).T + mult_mat = np.broadcast_to(mult_data.reshape(-1, 1), df.shape) + mask = np.isnan(mult_mat) else: + mult_mat = np.broadcast_to(mult_data.reshape(1, -1), df.shape) mask = np.isnan(mult_mat) mask = df.isna().values & mask df_result = df.mul(mult_data, axis=axis, fill_value=fill_val) - if np_int_flag == 1: - mult_np = np.nan_to_num(mult_data, nan=fill_val) - df_expected = (df.fillna(fill_val).mul(mult_np, axis=axis)).mask(mask, np.nan) - else: - df_expected = ( - df.fillna(fill_val).mul(mult_data.fillna(fill_val), axis=axis) - ).mask(mask, np.nan) + df_expected = (df.fillna(fill_val).mul(mult_data.fillna(fill_val), axis=axis)).mask( + mask, np.nan + ) tm.assert_frame_equal(df_result, df_expected) From 406cd15c12490823fffdef57785aa7aa1cf345c1 Mon Sep 17 00:00:00 2001 From: Eric Date: Mon, 13 Oct 2025 15:03:25 -0500 Subject: [PATCH 09/15] Applied PR feedback --- doc/source/whatsnew/v3.0.0.rst | 2 -- pandas/core/arrays/arrow/array.py | 16 +--------------- pandas/tests/arrays/string_/test_string.py | 10 ++++++++-- 3 files changed, 9 insertions(+), 19 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 
520f05d809f22..11378b3e5ab07 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -200,10 +200,8 @@ Other enhancements - :class:`Holiday` has gained the constructor argument and field ``exclude_dates`` to exclude specific datetimes from a custom holiday calendar (:issue:`54382`) - :class:`Rolling` and :class:`Expanding` now support ``nunique`` (:issue:`26958`) - :class:`Rolling` and :class:`Expanding` now support aggregations ``first`` and ``last`` (:issue:`33155`) -- :class:`StringDtype` now supports addition while maintaining element typing (:issue:`61581`) - :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`) - :meth:`.DataFrameGroupBy.transform`, :meth:`.SeriesGroupBy.transform`, :meth:`.DataFrameGroupBy.agg`, :meth:`.SeriesGroupBy.agg`, :meth:`.SeriesGroupBy.apply`, :meth:`.DataFrameGroupBy.apply` now support ``kurt`` (:issue:`40139`) -- :meth:`DataFrame.add` now supports string addition with null-likes (:issue:`61581`) - :meth:`DataFrame.apply` supports using third-party execution engines like the Bodo.ai JIT compiler (:issue:`60668`) - :meth:`DataFrame.iloc` and :meth:`Series.iloc` now support boolean masks in ``__getitem__`` for more consistent indexing behavior (:issue:`60994`) - :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now support Python's new-style format strings (e.g., ``"{:.6f}"``) for the ``float_format`` parameter, in addition to old-style ``%`` format strings and callables. This allows for more flexible and modern formatting of floating point numbers when exporting to CSV. (:issue:`49580`) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index d7dc13c96c22d..2eed608908440 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -890,14 +890,7 @@ def _op_method_error_message(self, other, op) -> str: def _evaluate_op_method(self, other, op, arrow_funcs) -> Self: pa_type = self._pa_array.type other_original = other - try: - other = self._box_pa(other) - except (ValueError, pa.lib.ArrowTypeError) as err: - # Categorical and Interval dtype raises errors in self._box_pa - # Could be fixed in the future if needed - raise TypeError( - "Incompatible type when converting to PyArrow dtype for operation." 
- ) from err + other = self._box_pa(other) if ( pa.types.is_string(pa_type) @@ -906,13 +899,6 @@ def _evaluate_op_method(self, other, op, arrow_funcs) -> Self: ): if op in [operator.add, roperator.radd]: sep = pa.scalar("", type=pa_type) - if ( - pa.types.is_string(other.type) - or pa.types.is_large_string(other.type) - or pa.types.is_binary(other.type) - or isna(other).all() - ): - other = other.cast(pa_type) try: if op is operator.add: result = pc.binary_join_element_wise(self._pa_array, other, sep) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index a038966d9f2e6..681830f7cadbb 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -276,6 +276,7 @@ def test_add_strings(dtype): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(reason="GH-28527") def test_add_frame(dtype): arr = pd.array(["a", "b", np.nan, np.nan], dtype=dtype) df = pd.DataFrame([["x", np.nan, "y", np.nan]]) @@ -289,6 +290,11 @@ def test_add_frame(dtype): expected = pd.DataFrame([["xa", np.nan, np.nan, np.nan]]).astype(dtype) tm.assert_frame_equal(result, expected, check_dtype=False) + if isinstance(dtype, "str[python]"): + # This ONE dtype actually succeeds the test + # We are manually failing it to maintain continuity + pytest.fail("Manually failed") + @pytest.mark.parametrize( "invalid", @@ -299,8 +305,8 @@ def test_add_frame(dtype): pd.Timestamp("2021-01-01"), True, pd.Period("2025-09"), - pd.Categorical(["test"]), - pd.offsets.Minute(3), + # pd.Categorical(["test"]), #TODO causes box_pa issue, will open issue + # pd.offsets.Minute(3), pd.Interval(1, 2, closed="right"), ], ) From ad9614bb15efe5da660a629f1aa760a66d25bd1b Mon Sep 17 00:00:00 2001 From: eicchen Date: Fri, 17 Oct 2025 21:31:34 -0500 Subject: [PATCH 10/15] Applied various suggestions from jbrock to testcases --- pandas/tests/arrays/string_/test_string.py | 52 +++++----------------- 1 file changed, 11 insertions(+), 41 deletions(-) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 681830f7cadbb..9eeb0381b796e 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -251,13 +251,13 @@ def test_mul(dtype): def test_add_series(dtype): arr = pd.array(["a", "b", "c", "d"], dtype=dtype) - df = pd.Series(["t", "y", "v", "w"], dtype=object) + ser = pd.Series(["t", "y", "v", "w"], dtype=object) - result = arr + df + result = arr + ser expected = pd.Series(["at", "by", "cv", "dw"]).astype(dtype) tm.assert_series_equal(result, expected) - result = df + arr + result = ser + arr expected = pd.Series(["ta", "yb", "vc", "wd"]).astype(dtype) tm.assert_series_equal(result, expected) @@ -277,53 +277,23 @@ def test_add_strings(dtype): @pytest.mark.xfail(reason="GH-28527") -def test_add_frame(dtype): +def test_add_frame(request, dtype): + if isinstance(dtype, "str[python]"): + # This ONE dtype actually succeeds the test + # We are manually failing it to maintain continuity + mark = pytest.mark.xfail(reason="[XPASS(strict)] GH-28527") + request.applymarker(mark) arr = pd.array(["a", "b", np.nan, np.nan], dtype=dtype) df = pd.DataFrame([["x", np.nan, "y", np.nan]]) assert arr.__add__(df) is NotImplemented result = arr + df expected = pd.DataFrame([["ax", np.nan, np.nan, np.nan]]).astype(dtype) - tm.assert_frame_equal(result, expected, check_dtype=False) + tm.assert_frame_equal(result, expected) result = df + arr expected = pd.DataFrame([["xa", np.nan, 
np.nan, np.nan]]).astype(dtype) - tm.assert_frame_equal(result, expected, check_dtype=False) - - if isinstance(dtype, "str[python]"): - # This ONE dtype actually succeeds the test - # We are manually failing it to maintain continuity - pytest.fail("Manually failed") - - -@pytest.mark.parametrize( - "invalid", - [ - 10, - 1.5, - pd.Timedelta(hours=31), - pd.Timestamp("2021-01-01"), - True, - pd.Period("2025-09"), - # pd.Categorical(["test"]), #TODO causes box_pa issue, will open issue - # pd.offsets.Minute(3), - pd.Interval(1, 2, closed="right"), - ], -) -def test_add_frame_invalid(dtype, invalid): - arr = pd.array(["a", np.nan], dtype=dtype) - df = pd.DataFrame([[invalid, invalid]]) - - msg = "|".join( - [ - r"can only concatenate str \(not \".+\"\) to str", - r"unsupported operand type\(s\) for \+: '.+' and 'str'", - r"operation 'add' not supported for dtype 'str|string' with dtype '.+'", - "Incompatible type when converting to PyArrow dtype for operation.", - ] - ) - with pytest.raises(TypeError, match=msg): - arr + df + tm.assert_frame_equal(result, expected) def test_comparison_methods_scalar(comparison_op, dtype): From 73d168b28175b6b3aff81f547bc00adf0bc5c5dd Mon Sep 17 00:00:00 2001 From: Eric Date: Mon, 20 Oct 2025 15:00:20 -0500 Subject: [PATCH 11/15] Updated catch for str[python[ in test_add_frame --- pandas/tests/arrays/string_/test_string.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 9eeb0381b796e..28ab95e5a1b2a 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -278,11 +278,9 @@ def test_add_strings(dtype): @pytest.mark.xfail(reason="GH-28527") def test_add_frame(request, dtype): - if isinstance(dtype, "str[python]"): - # This ONE dtype actually succeeds the test - # We are manually failing it to maintain continuity - mark = pytest.mark.xfail(reason="[XPASS(strict)] GH-28527") - request.applymarker(mark) + # "str[python]" actually safely adds but with the wrong dtype + # result = + # expected = arr = pd.array(["a", "b", np.nan, np.nan], dtype=dtype) df = pd.DataFrame([["x", np.nan, "y", np.nan]]) assert arr.__add__(df) is NotImplemented From 3d3a2a607ce304c9a7516a00b900676de31e6526 Mon Sep 17 00:00:00 2001 From: Eric Date: Mon, 20 Oct 2025 15:04:46 -0500 Subject: [PATCH 12/15] reupdated str[python] catch after git shenanigans --- pandas/tests/arrays/string_/test_string.py | 38 +++++----------------- 1 file changed, 9 insertions(+), 29 deletions(-) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index d4e90cc9b1abf..28ab95e5a1b2a 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -61,6 +61,14 @@ def string_dtype_highest_priority(dtype1, dtype2): return DTYPE_HIERARCHY[max(h1, h2)] +def test_dtype_constructor(): + pytest.importorskip("pyarrow") + + with tm.assert_produces_warning(FutureWarning): + dtype = pd.StringDtype("pyarrow_numpy") + assert dtype == pd.StringDtype("pyarrow", na_value=np.nan) + + def test_dtype_equality(): pytest.importorskip("pyarrow") @@ -531,8 +539,7 @@ def test_astype_float(dtype, any_float_dtype): # Don't compare arrays (37974) ser = pd.Series(["1.1", pd.NA, "3.3"], dtype=dtype) result = ser.astype(any_float_dtype) - item = np.nan if isinstance(result.dtype, np.dtype) else pd.NA - expected = pd.Series([1.1, item, 3.3], dtype=any_float_dtype) + expected = 
pd.Series([1.1, np.nan, 3.3], dtype=any_float_dtype) tm.assert_series_equal(result, expected) @@ -846,30 +853,3 @@ def test_string_array_view_type_error(): arr = pd.array(["a", "b", "c"], dtype="string") with pytest.raises(TypeError, match="Cannot change data-type for string array."): arr.view("i8") - - -@pytest.mark.parametrize("box", [pd.Series, pd.array]) -def test_numpy_array_ufunc(dtype, box): - arr = box(["a", "bb", "ccc"], dtype=dtype) - - # custom ufunc that works with string (object) input -> returning numeric - str_len_ufunc = np.frompyfunc(lambda x: len(x), 1, 1) - result = str_len_ufunc(arr) - expected_cls = pd.Series if box is pd.Series else np.array - # TODO we should infer int64 dtype here? - expected = expected_cls([1, 2, 3], dtype=object) - tm.assert_equal(result, expected) - - # custom ufunc returning strings - str_multiply_ufunc = np.frompyfunc(lambda x: x * 2, 1, 1) - result = str_multiply_ufunc(arr) - expected = box(["aa", "bbbb", "cccccc"], dtype=dtype) - if dtype.storage == "pyarrow": - # TODO ArrowStringArray should also preserve the class / dtype - if box is pd.array: - expected = np.array(["aa", "bbbb", "cccccc"], dtype=object) - else: - # not specifying the dtype because the exact dtype is not yet preserved - expected = pd.Series(["aa", "bbbb", "cccccc"]) - - tm.assert_equal(result, expected) From 76a122e05d71d6be5a58e9aa153891a5328cad13 Mon Sep 17 00:00:00 2001 From: Eric Date: Tue, 21 Oct 2025 14:01:45 -0500 Subject: [PATCH 13/15] Update python test_add_frame catch to better reflect issues across different enviroments --- pandas/tests/arrays/string_/test_string.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 28ab95e5a1b2a..b47b6f654e84c 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -278,9 +278,12 @@ def test_add_strings(dtype): @pytest.mark.xfail(reason="GH-28527") def test_add_frame(request, dtype): - # "str[python]" actually safely adds but with the wrong dtype - # result = - # expected = + if dtype.storage == "python": + # Inconsistent behavior between different versions of the python engine. 
+ # Some return correctly, some return but with the wrong dtype + # Others just fail, we are blanket failing all + mark = pytest.mark.xfail(reason="[XPASS(strict)] GH-28527") + request.node.applymarker(mark) arr = pd.array(["a", "b", np.nan, np.nan], dtype=dtype) df = pd.DataFrame([["x", np.nan, "y", np.nan]]) assert arr.__add__(df) is NotImplemented From 1d3260e65032bc3f94bdf70ec57e7913466fac0e Mon Sep 17 00:00:00 2001 From: Eric Date: Tue, 21 Oct 2025 14:54:25 -0500 Subject: [PATCH 14/15] I have no idea where that test came from --- pandas/tests/arrays/string_/test_string.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index b47b6f654e84c..bc0adf1bdf500 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -61,14 +61,6 @@ def string_dtype_highest_priority(dtype1, dtype2): return DTYPE_HIERARCHY[max(h1, h2)] -def test_dtype_constructor(): - pytest.importorskip("pyarrow") - - with tm.assert_produces_warning(FutureWarning): - dtype = pd.StringDtype("pyarrow_numpy") - assert dtype == pd.StringDtype("pyarrow", na_value=np.nan) - - def test_dtype_equality(): pytest.importorskip("pyarrow") From 30d4a072897f932ae1eaf8bfbb958fbb861fdd3c Mon Sep 17 00:00:00 2001 From: Eric Date: Tue, 28 Oct 2025 16:05:31 -0500 Subject: [PATCH 15/15] Edited original merge to be inline with #62869, added 'object' as dtype to be captured in frame:to_series --- pandas/core/frame.py | 5 +- pandas/tests/arithmetic/test_string.py | 12 ++- pandas/tests/arrays/string_/test_string.py | 107 --------------------- 3 files changed, 11 insertions(+), 113 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9e9866c5bb20f..f677f1e37a016 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8491,7 +8491,10 @@ def to_series(right): # pass dtype to avoid doing inference, which would break consistency # with Index/Series ops dtype = None - if getattr(right, "dtype", None) == object: + if ( + getattr(right, "dtype", None) == "object" + or getattr(right, "dtype", None) == object + ): # can't pass right.dtype unconditionally as that would break on e.g. # datetime64[h] ndarray dtype = object diff --git a/pandas/tests/arithmetic/test_string.py b/pandas/tests/arithmetic/test_string.py index 66b7a5bd440c3..08a276a86a934 100644 --- a/pandas/tests/arithmetic/test_string.py +++ b/pandas/tests/arithmetic/test_string.py @@ -244,9 +244,6 @@ def test_mul(any_string_dtype): def test_add_strings(any_string_dtype, request): dtype = any_string_dtype - if dtype != np.dtype(object): - mark = pytest.mark.xfail(reason="GH-28527") - request.applymarker(mark) arr = pd.array(["a", "b", "c", "d"], dtype=dtype) df = pd.DataFrame([["t", "y", "v", "w"]], dtype=object) assert arr.__add__(df) is NotImplemented @@ -261,10 +258,15 @@ def test_add_strings(any_string_dtype, request): @pytest.mark.xfail(reason="GH-28527") -def test_add_frame(dtype): +def test_add_frame(request, dtype): + if dtype.storage == "python": + # Inconsistent behavior between different versions of the python engine. 
+ # Some return correctly, some return but with the wrong dtype + # Others just fail, we are blanket failing all + mark = pytest.mark.xfail(reason="[XPASS(strict)] GH-28527") + request.node.applymarker(mark) arr = pd.array(["a", "b", np.nan, np.nan], dtype=dtype) df = pd.DataFrame([["x", np.nan, "y", np.nan]]) - assert arr.__add__(df) is NotImplemented result = arr + df diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index ae5aa76bb213a..413df674a5deb 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -159,113 +159,6 @@ def test_astype_roundtrip(dtype): tm.assert_series_equal(result2, ser2) -def test_add(dtype): - a = pd.Series(["a", "b", "c", None, None], dtype=dtype) - b = pd.Series(["x", "y", None, "z", None], dtype=dtype) - - result = a + b - expected = pd.Series(["ax", "by", None, None, None], dtype=dtype) - tm.assert_series_equal(result, expected) - - result = a.add(b) - tm.assert_series_equal(result, expected) - - result = a.radd(b) - expected = pd.Series(["xa", "yb", None, None, None], dtype=dtype) - tm.assert_series_equal(result, expected) - - result = a.add(b, fill_value="-") - expected = pd.Series(["ax", "by", "c-", "-z", None], dtype=dtype) - tm.assert_series_equal(result, expected) - - -def test_add_2d(dtype, request): - if dtype.storage == "pyarrow": - reason = "Failed: DID NOT RAISE " - mark = pytest.mark.xfail(raises=None, reason=reason) - request.applymarker(mark) - - a = pd.array(["a", "b", "c"], dtype=dtype) - b = np.array([["a", "b", "c"]], dtype=object) - with pytest.raises(ValueError, match="3 != 1"): - a + b - - s = pd.Series(a) - with pytest.raises(ValueError, match="3 != 1"): - s + b - - -def test_add_sequence(dtype): - a = pd.array(["a", "b", None, None], dtype=dtype) - other = ["x", None, "y", None] - - result = a + other - expected = pd.array(["ax", None, None, None], dtype=dtype) - tm.assert_extension_array_equal(result, expected) - - result = other + a - expected = pd.array(["xa", None, None, None], dtype=dtype) - tm.assert_extension_array_equal(result, expected) - - -def test_mul(dtype): - a = pd.array(["a", "b", None], dtype=dtype) - result = a * 2 - expected = pd.array(["aa", "bb", None], dtype=dtype) - tm.assert_extension_array_equal(result, expected) - - result = 2 * a - tm.assert_extension_array_equal(result, expected) - - -def test_add_series(dtype): - arr = pd.array(["a", "b", "c", "d"], dtype=dtype) - ser = pd.Series(["t", "y", "v", "w"], dtype=object) - - result = arr + ser - expected = pd.Series(["at", "by", "cv", "dw"]).astype(dtype) - tm.assert_series_equal(result, expected) - - result = ser + arr - expected = pd.Series(["ta", "yb", "vc", "wd"]).astype(dtype) - tm.assert_series_equal(result, expected) - - -def test_add_strings(dtype): - arr = pd.array(["a", "b", "c", "d"], dtype=dtype) - df = pd.DataFrame([["t", "y", "v", "w"]], dtype=object) - assert arr.__add__(df) is NotImplemented - - result = arr + df - expected = pd.DataFrame([["at", "by", "cv", "dw"]]).astype(dtype) - tm.assert_frame_equal(result, expected) - - result = df + arr - expected = pd.DataFrame([["ta", "yb", "vc", "wd"]]).astype(dtype) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.xfail(reason="GH-28527") -def test_add_frame(request, dtype): - if dtype.storage == "python": - # Inconsistent behavior between different versions of the python engine. 
- # Some return correctly, some return but with the wrong dtype - # Others just fail, we are blanket failing all - mark = pytest.mark.xfail(reason="[XPASS(strict)] GH-28527") - request.node.applymarker(mark) - arr = pd.array(["a", "b", np.nan, np.nan], dtype=dtype) - df = pd.DataFrame([["x", np.nan, "y", np.nan]]) - assert arr.__add__(df) is NotImplemented - - result = arr + df - expected = pd.DataFrame([["ax", np.nan, np.nan, np.nan]]).astype(dtype) - tm.assert_frame_equal(result, expected) - - result = df + arr - expected = pd.DataFrame([["xa", np.nan, np.nan, np.nan]]).astype(dtype) - tm.assert_frame_equal(result, expected) - - def test_constructor_raises(cls): if cls is pd.arrays.StringArray: msg = "StringArray requires a sequence of strings or pandas.NA"
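The behavior this series enables, sketched below as a minimal usage example rather than as part of any patch. It assumes the semantics encoded by the new tests (test_arith_flex_frame_fill_value_series, test_df_mul_array_fill_value): NaNs on either operand are filled with ``fill_value`` before the op, and positions missing on both sides stay missing. ``df`` and ``ser`` are hypothetical data made up for illustration.

import numpy as np
import pandas as pd

df = pd.DataFrame({"A": [1.0, np.nan, 3.0], "B": [4.0, 5.0, np.nan]})
ser = pd.Series({"A": 10.0, "B": np.nan})

# Before PATCH 01 this raised NotImplementedError("fill_value ... not supported.").
# With these patches applied, `ser` is broadcast across the rows of `df` and NaNs
# on either side are replaced with 0.0 before adding; positions that are missing
# on both sides remain NaN.
result = df.add(ser, axis="columns", fill_value=0.0)
# -> A: [11.0, 10.0, 13.0], B: [4.0, 5.0, NaN]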