diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index c41524619de95..0796c58cd9dfc 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -976,4 +976,5 @@ Datetimelike
 - Bug in comparison between objects with pyarrow date dtype and ``timestamp[pyarrow]`` or ``np.datetime64`` dtype failing to consider these as non-comparable (:issue:`62157`)
 - Bug in constructing arrays with :class:`ArrowDtype` with ``timestamp`` type incorrectly allowing ``Decimal("NaN")`` (:issue:`61773`)
 - Bug in constructing arrays with a timezone-aware :class:`ArrowDtype` from timezone-naive datetime objects incorrectly treating those as UTC times instead of wall times like :class:`DatetimeTZDtype` (:issue:`61775`)
+- Bug in :meth:`Index.value_counts` with ``sort=False`` not retaining ``freq`` on the result index for :class:`DatetimeIndex` and :class:`TimedeltaIndex` inputs (:issue:`33830`)
 - Bug in setting scalar values with mismatched resolution into arrays with non-nanosecond ``datetime64``, ``timedelta64`` or :class:`DatetimeTZDtype` incorrectly truncating those scalars (:issue:`56410`)
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index bbca78459ca75..5f7a6ed2a82a4 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -863,8 +863,10 @@ def value_counts_internal(
     dropna: bool = True,
 ) -> Series:
     from pandas import (
+        DatetimeIndex,
         Index,
         Series,
+        TimedeltaIndex,
     )
 
     index_name = getattr(values, "name", None)
@@ -929,6 +931,23 @@ def value_counts_internal(
             # Starting in 3.0, we no longer perform dtype inference on the
             # Index object we construct here, xref GH#56161
             idx = Index(keys, dtype=keys.dtype, name=index_name)
+
+            if (
+                bins is None
+                and not sort
+                and isinstance(values, (DatetimeIndex, TimedeltaIndex))
+                and values.inferred_freq is not None
+                and idx.equals(values)
+            ):
+                # GH#33830: with sort=False the result index holds exactly the
+                # input labels in their original order, so the input's inferred
+                # frequency is still valid; rebuild idx as the matching
+                # DatetimeIndex/TimedeltaIndex carrying that freq.
+                idx = type(values)(
+                    idx._data,
+                    freq=values.inferred_freq,
+                    name=idx.name,
+                )
+
             result = Series(counts, index=idx, name=name, copy=False)
 
     if sort:
diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py
index bcb31829a201f..5e102c15f04f9 100644
--- a/pandas/tests/base/test_value_counts.py
+++ b/pandas/tests/base/test_value_counts.py
@@ -339,3 +339,153 @@ def test_value_counts_object_inference_deprecated():
     exp = dti.value_counts()
     exp.index = exp.index.astype(object)
     tm.assert_series_equal(res, exp)
+
+
+def _vc_make_index(kind: str, periods=5, freq="D"):
+    if kind == "dt":
+        return pd.date_range("2016-01-01", periods=periods, freq=freq)
+    if kind == "td":
+        return pd.timedelta_range(Timedelta(0), periods=periods, freq=freq)
+    raise ValueError("kind must be 'dt' or 'td'")
+
+
+@pytest.mark.parametrize(
+    "kind,freq,normalize",
+    [
+        ("dt", "D", False),
+        ("dt", "D", True),
+        ("td", "D", False),
+        ("td", "D", True),
+        ("td", Timedelta(hours=1), False),
+        ("td", Timedelta(hours=1), True),
+    ],
+)
+def test_value_counts_freq_preserved_datetimelike_no_sort(kind, freq, normalize):
+    idx = _vc_make_index(kind, periods=5, freq=freq)
+    vc = idx.value_counts(sort=False, normalize=normalize)
+    assert vc.index.freq == idx.freq
+    if normalize:
+        assert np.isclose(vc.values, 1 / len(idx)).all()
+
+
+@pytest.mark.parametrize(
+    "kind,freq",
+    [
+        ("dt", "D"),
+        ("td", "D"),
+        ("td", Timedelta(hours=1)),
+    ],
+)
+def test_value_counts_freq_drops_datetimelike_when_sorted(kind, freq):
+    idx = _vc_make_index(kind, periods=5, freq=freq)
+    vc = idx.value_counts()  # default sort=True (reorders)
+    assert vc.index.freq is None
+
+
+@pytest.mark.parametrize(
+    "kind,freq",
+    [
+        ("dt", "D"),
+        ("td", "D"),
+        ("td", Timedelta(hours=1)),
+    ],
+)
+def test_value_counts_freq_drops_datetimelike_with_duplicates(kind, freq):
+    base = _vc_make_index(kind, periods=5, freq=freq)
+    obj = base.insert(1, base[1])  # duplicate one label
+    vc = obj.value_counts(sort=False)
+    assert vc.index.freq is None
+
+
+@pytest.mark.parametrize(
+    "kind,freq",
+    [
+        ("dt", "D"),
+        ("td", "D"),
+        ("td", Timedelta(hours=1)),
+    ],
+)
+def test_value_counts_freq_drops_datetimelike_with_gap(kind, freq):
+    base = _vc_make_index(kind, periods=5, freq=freq)
+    obj = base.delete(2)  # remove one step to break contiguity
+    vc = obj.value_counts(sort=False)
+    assert vc.index.freq is None
+
+
+@pytest.mark.parametrize(
+    "kind,freq,dropna,expect_hasnans",
+    [
+        ("dt", "D", False, True),  # keep NaT
+        ("dt", "D", True, False),  # drop NaT
+        ("td", "D", False, True),
+        ("td", "D", True, False),
+        ("td", Timedelta(hours=1), False, True),
+        ("td", Timedelta(hours=1), True, False),
+    ],
+)
+def test_value_counts_freq_drops_datetimelike_with_nat(
+    kind, freq, dropna, expect_hasnans
+):
+    base = _vc_make_index(kind, periods=3, freq=freq)
+    obj = base.insert(1, pd.NaT)
+    vc = obj.value_counts(dropna=dropna, sort=False)
+    assert vc.index.freq is None
+    assert vc.index.hasnans is expect_hasnans
+
+
+@pytest.mark.parametrize(
+    "freq,start,periods,sort",
+    [
+        ("D", "2016-01-01", 5, False),
+        ("D", "2016-01-01", 5, True),
+        ("M", "2016-01", 6, False),  # MonthEnd
+        ("M", "2016-01", 6, True),
+        ("Q-DEC", "2016Q1", 4, False),  # QuarterEnd (Dec anchored)
+        ("Q-DEC", "2016Q1", 4, True),
+        ("Y-DEC", "2014", 3, False),  # YearEnd (Dec anchored)
+        ("Y-DEC", "2014", 3, True),
+    ],
+)
+def test_value_counts_period_freq_preserved_sort_and_nosort(freq, start, periods, sort):
+    pi = pd.period_range(start=start, periods=periods, freq=freq)
+    vc = pi.value_counts(sort=sort)
+    assert isinstance(vc.index, pd.PeriodIndex)
+    assert vc.index.dtype == pi.dtype
+    assert vc.index.freq == pi.freq
+
+
+def test_value_counts_period_freq_preserved_with_duplicates():
+    pi = pd.period_range("2016-01", periods=5, freq="M")
+    obj = pi.insert(1, pi[1])  # duplicate one label
+    vc = obj.value_counts(sort=False)
+    assert isinstance(vc.index, pd.PeriodIndex)
+    assert vc.index.dtype == pi.dtype
+    assert vc.index.freq == pi.freq
+
+
+def test_value_counts_period_freq_preserved_with_gap():
+    pi = pd.period_range("2016-01", periods=5, freq="M")
+    obj = pi.delete(2)  # remove one element
+    vc = obj.value_counts(sort=False)
+    assert isinstance(vc.index, pd.PeriodIndex)
+    assert vc.index.dtype == pi.dtype
+    assert vc.index.freq == pi.freq
+
+
+def test_value_counts_period_freq_preserved_with_normalize():
+    pi = pd.period_range("2016-01", periods=4, freq="M")
+    vc = pi.value_counts(normalize=True, sort=False)
+    assert isinstance(vc.index, pd.PeriodIndex)
+    assert vc.index.dtype == pi.dtype
+    assert vc.index.freq == pi.freq
+    assert np.isclose(vc.values, 1 / len(pi)).all()
+
+
+def test_value_counts_period_freq_preserved_with_nat_dropna_true():
+    pi = pd.period_range("2016-01", periods=5, freq="M")
+    obj = pi.insert(1, pd.NaT)
+    vc = obj.value_counts(dropna=True, sort=False)
+    assert not vc.index.hasnans
+    assert isinstance(vc.index, pd.PeriodIndex)
+    assert vc.index.dtype == pi.dtype
+    assert vc.index.freq == pi.freq