Skip to content

Commit 9991295

Browse files
authored
BUG: Categorical(Series[object]) not preserving categories.dtype as object (#62757)
1 parent 50bab2e commit 9991295

File tree

5 files changed

+42
-3
lines changed

5 files changed

+42
-3
lines changed

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -954,6 +954,7 @@ Bug fixes
954954

955955
Categorical
956956
^^^^^^^^^^^
957+
- Bug in :class:`Categorical` where constructing from a pandas :class:`Series` or :class:`Index` with ``dtype='object'`` did not preserve the categories' dtype as ``object``; now the ``categories.dtype`` is preserved as ``object`` for these cases, while numpy arrays and Python sequences with ``dtype='object'`` continue to infer the most specific dtype (for example, ``str`` if all elements are strings) (:issue:`61778`)
957958
- Bug in :func:`Series.apply` where ``nan`` was ignored for :class:`CategoricalDtype` (:issue:`59938`)
958959
- Bug in :func:`testing.assert_index_equal` raising ``TypeError`` instead of ``AssertionError`` for incomparable ``CategoricalIndex`` when ``check_categorical=True`` and ``exact=False`` (:issue:`61935`)
959960
- Bug in :meth:`Categorical.astype` where ``copy=False`` would still trigger a copy of the codes (:issue:`62000`)

pandas/core/arrays/categorical.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -460,6 +460,10 @@ def __init__(
460460
codes = arr.indices.to_numpy()
461461
dtype = CategoricalDtype(categories, values.dtype.pyarrow_dtype.ordered)
462462
else:
463+
preserve_object = False
464+
if isinstance(values, (ABCIndex, ABCSeries)) and values.dtype == object:
465+
# GH#61778
466+
preserve_object = True
463467
if not isinstance(values, ABCIndex):
464468
# in particular RangeIndex xref test_index_equal_range_categories
465469
values = sanitize_array(values, None)
@@ -476,7 +480,14 @@ def __init__(
476480
"by passing in a categories argument."
477481
) from err
478482

479-
# we're inferring from values
483+
if preserve_object:
484+
# GH#61778 wrap categories in an Index to prevent dtype
485+
# inference in the CategoricalDtype constructor
486+
from pandas import Index
487+
488+
categories = Index(categories, dtype=object, copy=False)
489+
490+
# if not preserve_object, we're inferring from values
480491
dtype = CategoricalDtype(categories, dtype.ordered)
481492

482493
elif isinstance(values.dtype, CategoricalDtype):

pandas/tests/arrays/categorical/test_constructors.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -809,3 +809,28 @@ def test_range_values_preserves_rangeindex_categories(self, values, categories):
809809
result = Categorical(values=values, categories=categories).categories
810810
expected = RangeIndex(range(5))
811811
tm.assert_index_equal(result, expected, exact=True)
812+
813+
def test_categorical_preserve_object_dtype_from_pandas(self, using_infer_string):
814+
# GH#61778
815+
pylist = ["foo", "bar", "baz"]
816+
ser = Series(pylist, dtype="object")
817+
idx = Index(pylist, dtype="object")
818+
arr = np.array(pylist, dtype="object")
819+
820+
cat_from_ser = Categorical(ser)
821+
cat_from_idx = Categorical(idx)
822+
cat_from_arr = Categorical(arr)
823+
cat_from_list = Categorical(pylist)
824+
825+
# Series/Index with object dtype: categories keep
826+
# object dtype even if all elements are strings
827+
assert cat_from_ser.categories.dtype == object
828+
assert cat_from_idx.categories.dtype == object
829+
830+
if using_infer_string:
831+
# Numpy array or list: infer string dtype
832+
assert cat_from_arr.categories.dtype == "str"
833+
assert cat_from_list.categories.dtype == "str"
834+
else:
835+
assert cat_from_arr.categories.dtype == object
836+
assert cat_from_list.categories.dtype == object

pandas/tests/groupby/methods/test_value_counts.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -347,6 +347,10 @@ def test_against_frame_and_seriesgroupby(
347347
index_frame = expected.index.to_frame(index=False)
348348
index_frame["gender"] = index_frame["both"].str.split("-").str.get(0)
349349
index_frame["education"] = index_frame["both"].str.split("-").str.get(1)
350+
both_dtype = index_frame["both"].dtype
351+
index_frame = index_frame.astype(
352+
{"gender": both_dtype, "education": both_dtype}
353+
)
350354
del index_frame["both"]
351355
index_frame2 = index_frame.rename({0: None}, axis=1)
352356
expected.index = MultiIndex.from_frame(index_frame2)

pandas/tests/reshape/test_pivot.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2673,8 +2673,6 @@ def test_pivot_integer_bug(self, any_string_dtype):
26732673

26742674
result = df.pivot(index=1, columns=0, values=2)
26752675
expected_columns = Index(["A", "B"], name=0, dtype=any_string_dtype)
2676-
if any_string_dtype == "object":
2677-
expected_columns = expected_columns.astype("str")
26782676
tm.assert_index_equal(result.columns, expected_columns)
26792677

26802678
def test_pivot_index_none(self):

0 commit comments

Comments
 (0)