-
-
Notifications
You must be signed in to change notification settings - Fork 19.2k
BUG: creating Categorical from pandas Index/Series with "object" dtype infers string #62080
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
c4e1c18
e1a893d
cfa767f
c0ae870
5188b81
b63a723
0fb42cc
9216954
8f460ac
87a54fe
cddc574
e83e4f9
5ed039a
9b4b2d9
4855994
1b81162
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -454,6 +454,11 @@ def __init__( | |
| codes = arr.indices.to_numpy() | ||
| dtype = CategoricalDtype(categories, values.dtype.pyarrow_dtype.ordered) | ||
| else: | ||
| # Check for pandas Series/Index with object dtype | ||
| preserve_object_dtpe = False | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Typo: `dtpe` -> `dtype`. |
||
| if isinstance(values, (ABCSeries, ABCIndex)): | ||
| if values.dtype == "object": | ||
| preserve_object_dtpe = True | ||
| if not isinstance(values, ABCIndex): | ||
| # in particular RangeIndex xref test_index_equal_range_categories | ||
| values = sanitize_array(values, None) | ||
|
|
@@ -470,7 +475,13 @@ def __init__( | |
| "by passing in a categories argument." | ||
| ) from err | ||
|
|
||
| # we're inferring from values | ||
| # If we should preserve object dtype, force categories to object dtype | ||
| if preserve_object_dtpe: | ||
| # Only preserve object dtype if not all elements are strings | ||
| if not all(isinstance(x, str) for x in categories): | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why is this check necessary? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The change to always preserve object dtype for categories when constructing a Categorical from a pandas Series or Index with dtype="object" is a behavioral change that affects a wide range of pandas internals and user-facing APIs. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I don't think that is necessary. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The more I look at it, the more I think this misses the point of the motivating issue. I mean, wouldn't the new test this PR adds pass on main? |
||
| from pandas import Index | ||
|
|
||
| categories = Index(categories, dtype=object, copy=False) | ||
| dtype = CategoricalDtype(categories, dtype.ordered) | ||
|
|
||
| elif isinstance(values.dtype, CategoricalDtype): | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -786,3 +786,32 @@ def test_range_values_preserves_rangeindex_categories(self, values, categories): | |
| result = Categorical(values=values, categories=categories).categories | ||
| expected = RangeIndex(range(5)) | ||
| tm.assert_index_equal(result, expected, exact=True) | ||
|
|
||
| def test_categorical_preserve_object_dtype_from_pandas(self): | ||
| with pd.option_context("future.infer_string", True): | ||
| ser = Series(["foo", "bar", "baz"], dtype="object") | ||
| idx = Index(["foo", "bar", "baz"], dtype="object") | ||
| arr = np.array(["foo", "bar", "baz"], dtype="object") | ||
| pylist = ["foo", "bar", "baz"] | ||
|
|
||
| cat_from_ser = Categorical(ser) | ||
| cat_from_idx = Categorical(idx) | ||
| cat_from_arr = Categorical(arr) | ||
| cat_from_list = Categorical(pylist) | ||
|
|
||
| # Series/Index with object dtype: infer string | ||
| # dtype if all elements are strings | ||
| assert cat_from_ser.categories.inferred_type == "string" | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Checking `inferred_type` isn't going to give us what we want. Check `categories.dtype` directly. |
||
| assert cat_from_idx.categories.inferred_type == "string" | ||
|
|
||
| # Numpy array or list: infer string dtype | ||
| assert cat_from_arr.categories.inferred_type == "string" | ||
| assert cat_from_list.categories.inferred_type == "string" | ||
|
|
||
| # Mixed types: preserve object dtype | ||
| ser_mixed = Series(["foo", 1, None], dtype="object") | ||
| idx_mixed = Index(["foo", 1, None], dtype="object") | ||
| cat_mixed_ser = Categorical(ser_mixed) | ||
| cat_mixed_idx = Categorical(idx_mixed) | ||
| assert cat_mixed_ser.categories.dtype == "object" | ||
| assert cat_mixed_idx.categories.dtype == "object" | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
issue ref at the end