From 866075d1062363734412e127a16920f8e1e89396 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 12 Sep 2025 15:14:36 +0200 Subject: [PATCH 1/2] String dtype: keep select_dtypes(include=object) selecting string columns --- pandas/core/dtypes/cast.py | 4 +++- pandas/core/frame.py | 12 ++++++++---- .../tests/frame/methods/test_select_dtypes.py | 18 ++++++++++-------- 3 files changed, 21 insertions(+), 13 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index a3634e370cfc3..d387b2fcb1796 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -865,7 +865,9 @@ def invalidate_string_dtypes(dtype_set: set[DtypeObj]) -> None: np.dtype(" np.ndarray: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c1f8be1381b23..4d89a52fb4c71 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5146,10 +5146,14 @@ def check_int_infer_dtype(dtypes): def dtype_predicate(dtype: DtypeObj, dtypes_set) -> bool: # GH 46870: BooleanDtype._is_numeric == True but should be excluded dtype = dtype if not isinstance(dtype, ArrowDtype) else dtype.numpy_dtype - return issubclass(dtype.type, tuple(dtypes_set)) or ( - np.number in dtypes_set - and getattr(dtype, "_is_numeric", False) - and not is_bool_dtype(dtype) + return ( + issubclass(dtype.type, tuple(dtypes_set)) + or ( + np.number in dtypes_set + and getattr(dtype, "_is_numeric", False) + and not is_bool_dtype(dtype) + ) + or (dtype.type is str and np.object_ in dtypes_set) ) def predicate(arr: ArrayLike) -> bool: diff --git a/pandas/tests/frame/methods/test_select_dtypes.py b/pandas/tests/frame/methods/test_select_dtypes.py index 0354e9df3d168..d3e28d328c8fd 100644 --- a/pandas/tests/frame/methods/test_select_dtypes.py +++ b/pandas/tests/frame/methods/test_select_dtypes.py @@ -102,6 +102,10 @@ def test_select_dtypes_include_using_list_like(self, using_infer_string): ri = df.select_dtypes(include=[str]) tm.assert_frame_equal(ri, ei) + ri = df.select_dtypes(include=["object"]) + ei = df[["a"]] + tm.assert_frame_equal(ri, ei) + def test_select_dtypes_exclude_using_list_like(self): df = DataFrame( { @@ -309,17 +313,15 @@ def test_select_dtypes_not_an_attr_but_still_valid_dtype(self, using_infer_strin df["g"] = df.f.diff() assert not hasattr(np, "u8") r = df.select_dtypes(include=["i8", "O"], exclude=["timedelta"]) - if using_infer_string: - e = df[["b"]] - else: - e = df[["a", "b"]] + # if using_infer_string: + # TODO warn + e = df[["a", "b"]] tm.assert_frame_equal(r, e) r = df.select_dtypes(include=["i8", "O", "timedelta64[ns]"]) - if using_infer_string: - e = df[["b", "g"]] - else: - e = df[["a", "b", "g"]] + # if using_infer_string: + # TODO warn + e = df[["a", "b", "g"]] tm.assert_frame_equal(r, e) def test_select_dtypes_empty(self): From 46c0731a39718361976396fa71f650e30c86fbfe Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 21 Sep 2025 09:17:55 +0200 Subject: [PATCH 2/2] add whatsnew --- doc/source/whatsnew/v2.3.3.rst | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/doc/source/whatsnew/v2.3.3.rst b/doc/source/whatsnew/v2.3.3.rst index aaed7544d9975..0a029f683b6cb 100644 --- a/doc/source/whatsnew/v2.3.3.rst +++ b/doc/source/whatsnew/v2.3.3.rst @@ -18,6 +18,16 @@ Most changes in this release are related to :class:`StringDtype` which will become the default string dtype in pandas 3.0. See :ref:`whatsnew_230.upcoming_changes` for more details. +.. _whatsnew_233.string_fixes.improvements: + +Improvements +^^^^^^^^^^^^ +- Update :meth:`DataFrame.select_dtypes` to keep selecting ``str`` columns when + specifying ``include=["object"]`` for backwards compatibility. In a future + release, this will be deprecated and code for pandas 3+ should be updated to + do ``include=["str"]`` (:issue:`61916`) + + .. _whatsnew_233.string_fixes.bugs: Bug fixes