From 52e07b59ed9f4bb17245f4d305057238659360fc Mon Sep 17 00:00:00 2001 From: Zrahay Date: Sat, 25 Oct 2025 22:04:23 +0530 Subject: [PATCH 1/8] BUG: Validate numeric_only parameter in groupby aggregations --- pandas/core/groupby/groupby.py | 72 ++++++++++++++++++---------------- 1 file changed, 38 insertions(+), 34 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index fe7bf5bbc4c2c..5714e0e5d5879 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1754,44 +1754,48 @@ def _cython_agg_general( **kwargs, ): # Note: we never get here with how="ohlc" for DataFrameGroupBy; - # that goes through SeriesGroupBy + # that goes through SeriesGroupBy - data = self._get_data_to_aggregate(numeric_only=numeric_only, name=how) + # Check to confirm numeric_only is fed either True or False and no other data type + if(isinstance(numeric_only, bool)): + data = self._get_data_to_aggregate(numeric_only=numeric_only, name=how) - def array_func(values: ArrayLike) -> ArrayLike: - try: - result = self._grouper._cython_operation( - "aggregate", - values, - how, - axis=data.ndim - 1, - min_count=min_count, - **kwargs, - ) - except NotImplementedError: - # generally if we have numeric_only=False - # and non-applicable functions - # try to python agg - # TODO: shouldn't min_count matter? - # TODO: avoid special casing SparseArray here - if how in ["any", "all"] and isinstance(values, SparseArray): - pass - elif alt is None or how in ["any", "all", "std", "sem"]: - raise # TODO: re-raise as TypeError? should not be reached - else: - return result + def array_func(values: ArrayLike) -> ArrayLike: + try: + result = self._grouper._cython_operation( + "aggregate", + values, + how, + axis=data.ndim - 1, + min_count=min_count, + **kwargs, + ) + except NotImplementedError: + # generally if we have numeric_only=False + # and non-applicable functions + # try to python agg + # TODO: shouldn't min_count matter? + # TODO: avoid special casing SparseArray here + if how in ["any", "all"] and isinstance(values, SparseArray): + pass + elif alt is None or how in ["any", "all", "std", "sem"]: + raise # TODO: re-raise as TypeError? should not be reached + else: + return result - assert alt is not None - result = self._agg_py_fallback(how, values, ndim=data.ndim, alt=alt) - return result + assert alt is not None + result = self._agg_py_fallback(how, values, ndim=data.ndim, alt=alt) + return result - new_mgr = data.grouped_reduce(array_func) - res = self._wrap_agged_manager(new_mgr) - if how in ["idxmin", "idxmax"]: - # mypy expects how to be Literal["idxmin", "idxmax"]. - res = self._wrap_idxmax_idxmin(res, how=how, skipna=kwargs["skipna"]) # type: ignore[arg-type] - out = self._wrap_aggregated_output(res) - return out + new_mgr = data.grouped_reduce(array_func) + res = self._wrap_agged_manager(new_mgr) + if how in ["idxmin", "idxmax"]: + # mypy expects how to be Literal["idxmin", "idxmax"]. + res = self._wrap_idxmax_idxmin(res, how=how, skipna=kwargs["skipna"]) # type: ignore[arg-type] + out = self._wrap_aggregated_output(res) + return out + else: + raise ValueError("numeric_only accepts only Boolean values") def _cython_transform(self, how: str, numeric_only: bool = False, **kwargs): raise AbstractMethodError(self) From 9655a9a06440a25b8e9e400536637891dda5a88d Mon Sep 17 00:00:00 2001 From: Zrahay Date: Sun, 26 Oct 2025 03:18:44 +0530 Subject: [PATCH 2/8] Removed the if condition and added is_bool. Removed the unecessary comment on line 1759 and reverted the comment change on line 1757 --- pandas/core/groupby/groupby.py | 72 +++++++++++++++++----------------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 5714e0e5d5879..0ca9c1a72d6ea 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -86,6 +86,7 @@ class providing the base-class of operations. is_object_dtype, is_scalar, is_string_dtype, + is_bool, needs_i8_conversion, pandas_dtype, ) @@ -1756,46 +1757,45 @@ def _cython_agg_general( # Note: we never get here with how="ohlc" for DataFrameGroupBy; # that goes through SeriesGroupBy - # Check to confirm numeric_only is fed either True or False and no other data type - if(isinstance(numeric_only, bool)): - data = self._get_data_to_aggregate(numeric_only=numeric_only, name=how) + if not is_bool(numeric_only): + raise ValueError("numeric_only accepts only Boolean values") - def array_func(values: ArrayLike) -> ArrayLike: - try: - result = self._grouper._cython_operation( - "aggregate", - values, - how, - axis=data.ndim - 1, - min_count=min_count, - **kwargs, - ) - except NotImplementedError: - # generally if we have numeric_only=False - # and non-applicable functions - # try to python agg - # TODO: shouldn't min_count matter? - # TODO: avoid special casing SparseArray here - if how in ["any", "all"] and isinstance(values, SparseArray): - pass - elif alt is None or how in ["any", "all", "std", "sem"]: - raise # TODO: re-raise as TypeError? should not be reached - else: - return result + data = self._get_data_to_aggregate(numeric_only=numeric_only, name=how) - assert alt is not None - result = self._agg_py_fallback(how, values, ndim=data.ndim, alt=alt) + def array_func(values: ArrayLike) -> ArrayLike: + try: + result = self._grouper._cython_operation( + "aggregate", + values, + how, + axis=data.ndim - 1, + min_count=min_count, + **kwargs, + ) + except NotImplementedError: + # generally if we have numeric_only=False + # and non-applicable functions + # try to python agg + # TODO: shouldn't min_count matter? + # TODO: avoid special casing SparseArray here + if how in ["any", "all"] and isinstance(values, SparseArray): + pass + elif alt is None or how in ["any", "all", "std", "sem"]: + raise # TODO: re-raise as TypeError? should not be reached + else: return result - new_mgr = data.grouped_reduce(array_func) - res = self._wrap_agged_manager(new_mgr) - if how in ["idxmin", "idxmax"]: - # mypy expects how to be Literal["idxmin", "idxmax"]. - res = self._wrap_idxmax_idxmin(res, how=how, skipna=kwargs["skipna"]) # type: ignore[arg-type] - out = self._wrap_aggregated_output(res) - return out - else: - raise ValueError("numeric_only accepts only Boolean values") + assert alt is not None + result = self._agg_py_fallback(how, values, ndim=data.ndim, alt=alt) + return result + + new_mgr = data.grouped_reduce(array_func) + res = self._wrap_agged_manager(new_mgr) + if how in ["idxmin", "idxmax"]: + # mypy expects how to be Literal["idxmin", "idxmax"]. + res = self._wrap_idxmax_idxmin(res, how=how, skipna=kwargs["skipna"]) # type: ignore[arg-type] + out = self._wrap_aggregated_output(res) + return out def _cython_transform(self, how: str, numeric_only: bool = False, **kwargs): raise AbstractMethodError(self) From 6ef5696520b37c86d3331635b2c555dad09f88c3 Mon Sep 17 00:00:00 2001 From: Zrahay Date: Sun, 26 Oct 2025 04:15:30 +0530 Subject: [PATCH 3/8] Added a note to the whatsnew 3.0 with respect to my issue 62778 --- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 75b4c5c0fe14d..c5c9e9d589456 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -1150,6 +1150,7 @@ Groupby/resample/rolling - Bug in :meth:`.DataFrameGroupBy.groups` and :meth:`.SeriesGroupBy.groups` would fail when the groups were :class:`Categorical` with an NA value (:issue:`61356`) - Bug in :meth:`.DataFrameGroupBy.groups` and :meth:`.SeriesGroupby.groups` that would not respect groupby argument ``dropna`` (:issue:`55919`) - Bug in :meth:`.DataFrameGroupBy.median` where nat values gave an incorrect result. (:issue:`57926`) +- Bug in :meth:`.DataFrameGroupBy` reductions where boolean-valued inputs were mishandled in the Cython aggregation path (``_cython_agg_general``); adding an ``is_bool`` check fixes incorrect results for some bool inputs. (:issue:`62778`) - Bug in :meth:`.DataFrameGroupBy.quantile` when ``interpolation="nearest"`` is inconsistent with :meth:`DataFrame.quantile` (:issue:`47942`) - Bug in :meth:`.Resampler.interpolate` on a :class:`DataFrame` with non-uniform sampling and/or indices not aligning with the resulting resampled index would result in wrong interpolation (:issue:`21351`) - Bug in :meth:`.Series.rolling` when used with a :class:`.BaseIndexer` subclass and computing min/max (:issue:`46726`) From 5a72c79ddc45dbf5b2fcf0feb27c9b7dfa9599e3 Mon Sep 17 00:00:00 2001 From: Zrahay Date: Sun, 26 Oct 2025 19:03:59 +0530 Subject: [PATCH 4/8] Added the suggested changes. Removed the redundant comments and tests from test_reductions.py --- pandas/core/groupby/groupby.py | 2 +- pandas/tests/groupby/test_reductions.py | 13 ++----------- 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 0ca9c1a72d6ea..c227e96bb206f 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1755,7 +1755,7 @@ def _cython_agg_general( **kwargs, ): # Note: we never get here with how="ohlc" for DataFrameGroupBy; - # that goes through SeriesGroupBy + # that goes through SeriesGroupBy if not is_bool(numeric_only): raise ValueError("numeric_only accepts only Boolean values") diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index 519e9ca974b5c..7212f4411d7b3 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -1522,13 +1522,10 @@ def test_groupby_std_datetimelike(): tm.assert_frame_equal(result, expected) def test_mean_numeric_only_validates_bool(): - """ - Test that numeric_only parameter only accepts boolean values. - See GH#62778 - """ + # GH#62778 + df = pd.DataFrame({"A": range(5), "B": range(5)}) - # These test cases should raise a ValueError msg = "numeric_only accepts only Boolean values" with pytest.raises(ValueError, match=msg): df.groupby(["A"]).mean(["B"]) @@ -1538,9 +1535,3 @@ def test_mean_numeric_only_validates_bool(): with pytest.raises(ValueError, match=msg): df.groupby(["A"]).mean(numeric_only=1) - - # These test cases should work absolutely fine - df.groupby(["A"]).mean() - df.groupby(["A"]).mean(numeric_only=True) - df.groupby(["A"]).mean(numeric_only=False) - From 8056bb863ec9a202854ebcfd1582d0e0e41c572a Mon Sep 17 00:00:00 2001 From: Zrahay Date: Sun, 26 Oct 2025 22:02:04 +0530 Subject: [PATCH 5/8] Change made to the comments in whatsnew 3.0 --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index c5c9e9d589456..7e02a20452b5c 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -1150,7 +1150,7 @@ Groupby/resample/rolling - Bug in :meth:`.DataFrameGroupBy.groups` and :meth:`.SeriesGroupBy.groups` would fail when the groups were :class:`Categorical` with an NA value (:issue:`61356`) - Bug in :meth:`.DataFrameGroupBy.groups` and :meth:`.SeriesGroupby.groups` that would not respect groupby argument ``dropna`` (:issue:`55919`) - Bug in :meth:`.DataFrameGroupBy.median` where nat values gave an incorrect result. (:issue:`57926`) -- Bug in :meth:`.DataFrameGroupBy` reductions where boolean-valued inputs were mishandled in the Cython aggregation path (``_cython_agg_general``); adding an ``is_bool`` check fixes incorrect results for some bool inputs. (:issue:`62778`) +- Bug in :meth:`.DataFrameGroupBy` reductions where non-Boolean values were allowed for the ``numeric_only`` argument; passing a non-Boolean value will now raise (:issue:`62778`) - Bug in :meth:`.DataFrameGroupBy.quantile` when ``interpolation="nearest"`` is inconsistent with :meth:`DataFrame.quantile` (:issue:`47942`) - Bug in :meth:`.Resampler.interpolate` on a :class:`DataFrame` with non-uniform sampling and/or indices not aligning with the resulting resampled index would result in wrong interpolation (:issue:`21351`) - Bug in :meth:`.Series.rolling` when used with a :class:`.BaseIndexer` subclass and computing min/max (:issue:`46726`) From 63f84433274fc640e104caf005b142b60c452a59 Mon Sep 17 00:00:00 2001 From: Zrahay Date: Sun, 26 Oct 2025 20:46:41 +0000 Subject: [PATCH 6/8] Apply pre-commit formatting fixes --- pandas/core/groupby/groupby.py | 6 ++---- pandas/tests/groupby/test_reductions.py | 3 ++- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index c227e96bb206f..910268ebdfb8a 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -76,6 +76,7 @@ class providing the base-class of operations. ensure_dtype_can_hold_na, ) from pandas.core.dtypes.common import ( + is_bool, is_bool_dtype, is_float_dtype, is_hashable, @@ -86,7 +87,6 @@ class providing the base-class of operations. is_object_dtype, is_scalar, is_string_dtype, - is_bool, needs_i8_conversion, pandas_dtype, ) @@ -110,9 +110,7 @@ class providing the base-class of operations. SparseArray, ) from pandas.core.arrays.string_ import StringDtype -from pandas.core.arrays.string_arrow import ( - ArrowStringArray, -) +from pandas.core.arrays.string_arrow import ArrowStringArray from pandas.core.base import ( PandasObject, SelectionMixin, diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index 7212f4411d7b3..9c13ceec49e35 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -1521,10 +1521,11 @@ def test_groupby_std_datetimelike(): expected = DataFrame({"A": exp_ser, "B": exp_ser, "C": exp_ser}) tm.assert_frame_equal(result, expected) + def test_mean_numeric_only_validates_bool(): # GH#62778 - df = pd.DataFrame({"A": range(5), "B": range(5)}) + df = DataFrame({"A": range(5), "B": range(5)}) msg = "numeric_only accepts only Boolean values" with pytest.raises(ValueError, match=msg): From bb9b0ab9f27cf362d38bc541b9d206261c762d7b Mon Sep 17 00:00:00 2001 From: Zrahay Date: Sun, 26 Oct 2025 21:02:54 +0000 Subject: [PATCH 7/8] Sort whatsnew entries alphabetically --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 7e02a20452b5c..457ab6eb965ca 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -1150,8 +1150,8 @@ Groupby/resample/rolling - Bug in :meth:`.DataFrameGroupBy.groups` and :meth:`.SeriesGroupBy.groups` would fail when the groups were :class:`Categorical` with an NA value (:issue:`61356`) - Bug in :meth:`.DataFrameGroupBy.groups` and :meth:`.SeriesGroupby.groups` that would not respect groupby argument ``dropna`` (:issue:`55919`) - Bug in :meth:`.DataFrameGroupBy.median` where nat values gave an incorrect result. (:issue:`57926`) -- Bug in :meth:`.DataFrameGroupBy` reductions where non-Boolean values were allowed for the ``numeric_only`` argument; passing a non-Boolean value will now raise (:issue:`62778`) - Bug in :meth:`.DataFrameGroupBy.quantile` when ``interpolation="nearest"`` is inconsistent with :meth:`DataFrame.quantile` (:issue:`47942`) +- Bug in :meth:`.DataFrameGroupBy` reductions where non-Boolean values were allowed for the ``numeric_only`` argument; passing a non-Boolean value will now raise (:issue:`62778`) - Bug in :meth:`.Resampler.interpolate` on a :class:`DataFrame` with non-uniform sampling and/or indices not aligning with the resulting resampled index would result in wrong interpolation (:issue:`21351`) - Bug in :meth:`.Series.rolling` when used with a :class:`.BaseIndexer` subclass and computing min/max (:issue:`46726`) - Bug in :meth:`DataFrame.ewm` and :meth:`Series.ewm` when passed ``times`` and aggregation functions other than mean (:issue:`51695`) From 8a7918fcc5e1c0f69123727b2cfada89a3a13fd2 Mon Sep 17 00:00:00 2001 From: Zrahay Date: Tue, 28 Oct 2025 22:54:35 +0530 Subject: [PATCH 8/8] Add .editorconfig for consistent editor settings Define formatting rules for .py, .pyx, .c, meson.build, YAML, JSON, .sh, and .rst files to complement existing formatters and improve developer experience. Closes #62736 --- .editorconfig | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 .editorconfig diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000000000..287395138d517 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,47 @@ + +root = true + +[*] +charset = UTF-8 +end_of_line = lf +insert_final_newline = true +trim_trailing_whitespace = true +spelling_language = en-us + +[*.py] +max_line_length = 88 +indent_style = space +indent_size = 4 + +[*.pyx] +max_line_length = 88 +indent_style = space +indent_size = 4 + + +[*.c] +max_line_length = 80 +indent_style = space +indent_size = 2 + +[meson.build] +max_line_length = 80 +indent_style = space +indent_size = 4 + +[*.{yml,yaml}] +indent_style = space +indent_size = 4 + +[*.json] +indent_style = space +indent_size = 4 + + +[*.sh] +indent_style = space +indent_size = 4 + +[*.rst] +indent_style = space +indent_size = 4