diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 75b4c5c0fe14d..671e3f9b86fbf 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -371,6 +371,63 @@ In cases with mixed-resolution inputs, the highest resolution is used: In [2]: pd.to_datetime([pd.Timestamp("2024-03-22 11:43:01"), "2024-03-22 11:43:01.002"]).dtype Out[2]: dtype(' Index: """ Extract combined index: return intersection or union (depending on the @@ -81,7 +81,8 @@ def get_objs_combined_axis( axis : {0 or 'index', 1 or 'outer'}, default 0 The axis to extract indexes from. sort : bool, default True - Whether the result index should come out sorted or not. + Whether the result index should come out sorted or not. NoDefault + used for deprecation in GH#57335. Returns ------- @@ -108,7 +109,7 @@ def _get_distinct_objs(objs: list[Index]) -> list[Index]: def _get_combined_index( indexes: list[Index], intersect: bool = False, - sort: bool = False, + sort: bool | lib.NoDefault = False, ) -> Index: """ Return the union or intersection of indexes. @@ -121,7 +122,8 @@ def _get_combined_index( If True, calculate the intersection between indexes. Otherwise, calculate the union. sort : bool, default False - Whether the result index should come out sorted or not. + Whether the result index should come out sorted or not. 
NoDefault + used for deprecation of GH#57335. Returns ------- @@ -138,10 +140,10 @@ def _get_combined_index( for other in indexes[1:]: index = index.intersection(other) else: - index = union_indexes(indexes, sort=False) + index = union_indexes(indexes, sort=sort if sort is lib.no_default else False) index = ensure_index(index) - if sort: + if sort and sort is not lib.no_default: index = safe_sort_index(index) return index @@ -180,7 +182,7 @@ def safe_sort_index(index: Index) -> Index: return index -def union_indexes(indexes, sort: bool | None = True) -> Index: +def union_indexes(indexes, sort: bool | None | lib.NoDefault = True) -> Index: """ Return the union of indexes. @@ -190,7 +192,8 @@ def union_indexes(indexes, sort: bool | None = True) -> Index: ---------- indexes : list of Index or list objects sort : bool, default True - Whether the result index should come out sorted or not. + Whether the result index should come out sorted or not. NoDefault + used for deprecation of GH#57335. Returns ------- @@ -201,7 +204,7 @@ def union_indexes(indexes, sort: bool | None = True) -> Index: if len(indexes) == 1: result = indexes[0] if isinstance(result, list): - if not sort: + if not sort or sort is lib.no_default: result = Index(result) else: result = Index(sorted(result)) @@ -227,7 +230,8 @@ def union_indexes(indexes, sort: bool | None = True) -> Index: raise TypeError("Cannot join tz-naive with tz-aware DatetimeIndex") if num_dtis == len(indexes): - sort = True + if sort is lib.no_default: + sort = True result = indexes[0] elif num_dtis > 1: diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index e39c716784455..4124ea3651e62 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -45,7 +45,9 @@ ensure_index, get_objs_combined_axis, get_unanimous_names, + union_indexes, ) +from pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.internals import concatenate_managers if TYPE_CHECKING: @@ -162,7 +164,7 @@ 
def concat( levels=None, names: list[HashableT] | None = None, verify_integrity: bool = False, - sort: bool = False, + sort: bool | lib.NoDefault = lib.no_default, copy: bool | lib.NoDefault = lib.no_default, ) -> DataFrame | Series: """ @@ -405,13 +407,41 @@ def concat( "Only can inner (intersect) or outer (union) join the other axis" ) - if not is_bool(sort): + objs, keys, ndims = _clean_keys_and_objs(objs, keys) + + if sort is lib.no_default: + if axis == 0: + non_concat_axis = [ + obj.columns if isinstance(obj, ABCDataFrame) else Index([obj.name]) + for obj in objs + ] + else: + non_concat_axis = [obj.index for obj in objs] + + if ( + intersect + or any(not isinstance(index, DatetimeIndex) for index in non_concat_axis) + or all( + id(prev) == id(curr) + for prev, curr in zip(non_concat_axis, non_concat_axis[1:]) + ) + or ( + all( + prev[-1] <= curr[0] and prev.is_monotonic_increasing + for prev, curr in zip(non_concat_axis, non_concat_axis[1:]) + if not prev.empty and not curr.empty + ) + and non_concat_axis[-1].is_monotonic_increasing + ) + ): + # Sorting or not will not impact the result. + sort = False + elif not is_bool(sort): raise ValueError( f"The 'sort' keyword only accepts boolean values; {sort} was passed." 
) - sort = bool(sort) - - objs, keys, ndims = _clean_keys_and_objs(objs, keys) + else: + sort = bool(sort) # select an object to be our result reference sample, objs = _get_sample_object(objs, ndims, keys, names, levels, intersect) @@ -436,9 +466,10 @@ def concat( if len(ndims) > 1: objs = _sanitize_mixed_ndim(objs, sample, ignore_index, bm_axis) + orig_axis = axis axis = 1 - bm_axis if is_frame else 0 names = names or getattr(keys, "names", None) - return _get_result( + result = _get_result( objs, is_series, bm_axis, @@ -452,6 +483,28 @@ def concat( axis, ) + if sort is lib.no_default: + if orig_axis == 0: + non_concat_axis = [ + obj.columns if isinstance(obj, ABCDataFrame) else Index([obj.name]) + for obj in objs + ] + else: + non_concat_axis = [obj.index for obj in objs] + no_sort_result_index = union_indexes(non_concat_axis, sort=False) + orig = result.index if orig_axis == 1 else result.columns + if not no_sort_result_index.equals(orig): + msg = ( + "Sorting by default when concatenating all DatetimeIndex is " + "deprecated. In the future, pandas will respect the default " + "of `sort=False`. Specify `sort=True` or `sort=False` to " + "silence this message. If you see this warnings when not " + "directly calling concat, report a bug to pandas." 
+ ) + warnings.warn(msg, Pandas4Warning, stacklevel=find_stack_level()) + + return result + def _sanitize_mixed_ndim( objs: list[Series | DataFrame], @@ -510,7 +563,7 @@ def _get_result( bm_axis: AxisInt, ignore_index: bool, intersect: bool, - sort: bool, + sort: bool | lib.NoDefault, keys: Iterable[Hashable] | None, levels, verify_integrity: bool, @@ -620,7 +673,7 @@ def new_axes( objs: list[Series | DataFrame], bm_axis: AxisInt, intersect: bool, - sort: bool, + sort: bool | lib.NoDefault, keys: Iterable[Hashable] | None, names: list[HashableT] | None, axis: AxisInt, diff --git a/pandas/tests/groupby/methods/test_groupby_shift_diff.py b/pandas/tests/groupby/methods/test_groupby_shift_diff.py index 1256046d81949..60e5fd9fa1863 100644 --- a/pandas/tests/groupby/methods/test_groupby_shift_diff.py +++ b/pandas/tests/groupby/methods/test_groupby_shift_diff.py @@ -248,3 +248,21 @@ def test_group_shift_with_multiple_periods_and_both_fill_and_freq_deprecated(): msg = "Passing a 'freq' together with a 'fill_value'" with pytest.raises(ValueError, match=msg): df.groupby("b")[["a"]].shift([1, 2], fill_value=1, freq="h") + + +def test_groupby_shift_multiple_periods_unsorted_index(): + # https://github.com/pandas-dev/pandas/pull/62843 + idx = date_range("1/1/2000", periods=4, freq="h") + df = DataFrame( + {"a": [1, 2, 3], "b": [True, True, False]}, + index=[idx[2], idx[0], idx[1]], + ) + result = df.groupby("b")[["a"]].shift([0, 1], freq="h") + expected = DataFrame( + { + "a_0": [1.0, 2.0, 3.0, np.nan], + "a_1": [3.0, np.nan, 2.0, 1.0], + }, + index=[idx[2], idx[0], idx[1], idx[3]], + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index 2d0eb5d14a1d9..7d0e534cb7689 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -10,7 +10,10 @@ import numpy as np import pytest -from pandas.errors import InvalidIndexError +from 
pandas.errors import ( + InvalidIndexError, + Pandas4Warning, +) import pandas as pd from pandas import ( @@ -434,7 +437,9 @@ def test_concat_bug_1719(self): # to join with union # these two are of different length! left = concat([ts1, ts2], join="outer", axis=1) - right = concat([ts2, ts1], join="outer", axis=1) + msg = "Sorting by default when concatenating all DatetimeIndex is deprecated" + with tm.assert_produces_warning(Pandas4Warning, match=msg): + right = concat([ts2, ts1], join="outer", axis=1) assert len(left) == len(right) diff --git a/pandas/tests/reshape/concat/test_datetimes.py b/pandas/tests/reshape/concat/test_datetimes.py index 0cf3192ea3a74..b1cba7ee31eac 100644 --- a/pandas/tests/reshape/concat/test_datetimes.py +++ b/pandas/tests/reshape/concat/test_datetimes.py @@ -5,6 +5,8 @@ import numpy as np import pytest +from pandas.errors import Pandas4Warning + import pandas as pd from pandas import ( DataFrame, @@ -69,7 +71,9 @@ def test_concat_datetime_timezone(self): idx3 = date_range("2011-01-01", periods=3, freq="h", tz="Asia/Tokyo") df3 = DataFrame({"b": [1, 2, 3]}, index=idx3) - result = concat([df1, df3], axis=1) + msg = "Sorting by default when concatenating all DatetimeIndex" + with tm.assert_produces_warning(Pandas4Warning, match=msg): + result = concat([df1, df3], axis=1) exp_idx = DatetimeIndex( [