Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 33 additions & 10 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -14093,18 +14093,38 @@ def to_period(
setattr(new_obj, axis_name, new_ax)
return new_obj

def isin(self, values: Series | DataFrame | Sequence | Mapping) -> DataFrame:
def isin(
self,
values: Series | DataFrame | Sequence | Mapping,
ignore_index: bool = False,
) -> DataFrame:
"""
Whether each element in the DataFrame is contained in values.

Parameters
----------
values : iterable, Series, DataFrame or dict
The result will only be true at a location if all the
labels match. If `values` is a Series, that's the index. If
`values` is a dict, the keys must be the column names,
which must match. If `values` is a DataFrame,
then both the index and column labels must match.
labels match.

- If `values` is a Series, the index labels must match.
- If `values` is a dict, the keys must be column names,
which must match.
- If `values` is a DataFrame:

* When ``ignore_index=False`` (default), both the index
and column labels must match, and comparison is done
elementwise.
* When ``ignore_index=True``, only column labels must
match. Each element in the DataFrame is compared
against the set of values in the corresponding column
of ``values``, ignoring row index alignment.

ignore_index : bool, default False
*Only valid when `values` is a DataFrame.*
If True, ignore index alignment and simply check
if each value in each column occurs in the same
column of `values`.

Returns
-------
Expand All @@ -14121,8 +14141,8 @@ def isin(self, values: Series | DataFrame | Sequence | Mapping) -> DataFrame:

Notes
-----
``__iter__`` is used (and not ``__contains__``) to iterate over values
when checking if it contains the elements in DataFrame.
``__iter__`` is used (and not ``__contains__``) to iterate over values
when checking if it contains the elements in DataFrame.

Examples
--------
Expand Down Expand Up @@ -14185,9 +14205,12 @@ def isin(self, values: Series | DataFrame | Sequence | Mapping) -> DataFrame:
raise ValueError("cannot compute isin with a duplicate axis.")
result = self.eq(values.reindex_like(self), axis="index")
elif isinstance(values, DataFrame):
if not (values.columns.is_unique and values.index.is_unique):
raise ValueError("cannot compute isin with a duplicate axis.")
result = self.eq(values.reindex_like(self))
if ignore_index:
result = self.isin(values.to_dict("list"))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Converting a DataFrame to a Python dictionary is going to be a huge performance hit, and may introduce different comparison semantics than what pandas offers naturally. Is there a way to accomplish this using the built-in indexers, much like the `ignore_index=False` case?

else:
if not (values.columns.is_unique and values.index.is_unique):
raise ValueError("cannot compute isin with a duplicate axis.")
result = self.eq(values.reindex_like(self))
else:
if not is_list_like(values):
raise TypeError(
Expand Down
62 changes: 62 additions & 0 deletions pandas/tests/frame/methods/test_isin.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,3 +225,65 @@ def test_isin_not_lossy(self):
result = df.isin([val])
expected = DataFrame({"a": [True], "b": [False]})
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize(
    "values_df,expected",
    [
        # Same per-column values as the frame under test, rows reordered.
        (
            DataFrame({"A": [2, 1], "B": [0, 2]}),
            DataFrame({"A": [True, True], "B": [True, True]}),
        ),
        # ``values`` holds only the first row's value for each column.
        (
            DataFrame({"A": [1], "B": [0]}),
            DataFrame({"A": [True, False], "B": [True, False]}),
        ),
        # Disjoint values: no element is expected to match.
        (
            DataFrame({"A": [5, 6], "B": [7, 8]}),
            DataFrame({"A": [False, False], "B": [False, False]}),
        ),
        # ``values`` has no "B" column, so "B" is expected to be all False.
        pytest.param(
            DataFrame({"A": [1, 2]}),
            DataFrame({"A": [True, True], "B": [False, False]}),
            id="missing_column",
        ),
    ],
)
def test_isin_ignore_index(self, values_df, expected):
    """
    Check ``DataFrame.isin(values_df, ignore_index=True)`` on a fixed frame.

    Each parametrized case supplies the lookup frame ``values_df`` and the
    boolean frame ``expected`` that the call must produce.

    GH#62620
    """
    frame = DataFrame({"A": [1, 2], "B": [0, 2]})
    observed = frame.isin(values_df, ignore_index=True)
    tm.assert_frame_equal(observed, expected)

def test_isin_ignore_index_with_duplicates(self):
    """
    Check ``ignore_index=True`` when the lookup frame repeats values.

    The lookup frame lists 1 twice in column "A" and 0 three times in
    column "B"; the expected result marks 1 and 2 as present in "A",
    3 as absent, and every 0 in "B" as present.

    GH#62620
    """
    lookup = DataFrame({"A": [1, 1, 2], "B": [0, 0, 0]})
    frame = DataFrame({"A": [1, 2, 3], "B": [0, 0, 0]})
    expected = DataFrame({"A": [True, True, False], "B": [True, True, True]})

    observed = frame.isin(lookup, ignore_index=True)
    tm.assert_frame_equal(observed, expected)

def test_isin_ignore_index_diff_indexes(self):
    """
    Check ``ignore_index=True`` when the two frames share no row labels.

    The frame under test is indexed by "row1"/"row2" and the lookup frame
    by "x"/"y"; the expected result is all True and carries the row labels
    of the frame under test.

    GH#62620
    """
    row_labels = ["row1", "row2"]
    frame = DataFrame({"A": [1, 2], "B": [0, 2]}, index=row_labels)
    lookup = DataFrame({"A": [2, 1], "B": [2, 0]}, index=["x", "y"])
    expected = DataFrame({"A": [True, True], "B": [True, True]}, index=row_labels)

    observed = frame.isin(lookup, ignore_index=True)
    tm.assert_frame_equal(observed, expected)
Loading