- 
          
 - 
                Notifications
    
You must be signed in to change notification settings  - Fork 19.2k
 
BUG: Dataframe arithmatic operators don't work with Series using fill_value #61828
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
99ae672
              4e77fb7
              eb12b34
              7e23b65
              4617108
              bca56fe
              7273396
              4493e08
              406cd15
              ad9614b
              5a64e5c
              73d168b
              13ed0d6
              3d3a2a6
              76a122e
              1d3260e
              32360a8
              30d4a07
              File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | 
|---|---|---|
| 
          
            
          
           | 
    @@ -8491,7 +8491,10 @@ def to_series(right): | |
| # pass dtype to avoid doing inference, which would break consistency | ||
| # with Index/Series ops | ||
| dtype = None | ||
| if getattr(right, "dtype", None) == object: | ||
| if ( | ||
| getattr(right, "dtype", None) == "object" | ||
| or getattr(right, "dtype", None) == object | ||
| 
         There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this shouldn't be necessary There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You're right, this is here to keep the output dtype for some operations as objects. Specifically,  When adding an object to another object. I can remove it if you think adding an exception for this test case is the better option.  | 
||
| ): | ||
| # can't pass right.dtype unconditionally as that would break on e.g. | ||
| # datetime64[h] ndarray | ||
| dtype = object | ||
| 
          
            
          
           | 
    @@ -8595,27 +8598,34 @@ def _maybe_align_series_as_frame(self, series: Series, axis: AxisInt): | |
| blockwise. | ||
| """ | ||
| rvalues = series._values | ||
| if not isinstance(rvalues, np.ndarray): | ||
| # TODO(EA2D): no need to special-case with 2D EAs | ||
| if rvalues.dtype in ("datetime64[ns]", "timedelta64[ns]"): | ||
| # We can losslessly+cheaply cast to ndarray | ||
| rvalues = np.asarray(rvalues) | ||
| if lib.is_np_dtype(rvalues.dtype): | ||
| # We can losslessly+cheaply cast to ndarray | ||
| # i.e. ndarray or dt64[naive], td64 | ||
| # TODO(EA2D): no need to special case with 2D EAs | ||
| rvalues = np.asarray(rvalues) | ||
| 
     | 
||
| if axis == 0: | ||
| rvalues = rvalues.reshape(-1, 1) | ||
| else: | ||
| return series | ||
| rvalues = rvalues.reshape(1, -1) | ||
| 
     | 
||
| if axis == 0: | ||
| rvalues = rvalues.reshape(-1, 1) | ||
| else: | ||
| rvalues = rvalues.reshape(1, -1) | ||
| rvalues = np.broadcast_to(rvalues, self.shape) | ||
| # pass dtype to avoid doing inference | ||
| df = self._constructor(rvalues, dtype=rvalues.dtype) | ||
| 
     | 
||
| rvalues = np.broadcast_to(rvalues, self.shape) | ||
| # pass dtype to avoid doing inference | ||
| return self._constructor( | ||
| rvalues, | ||
| index=self.index, | ||
| columns=self.columns, | ||
| dtype=rvalues.dtype, | ||
| ).__finalize__(series) | ||
| else: | ||
| # GH#61581 | ||
| if axis == 0: | ||
| df = DataFrame(dict.fromkeys(range(self.shape[1]), rvalues)) | ||
| else: | ||
| nrows = self.shape[0] | ||
| df = DataFrame( | ||
| {i: rvalues[[i]].repeat(nrows) for i in range(self.shape[1])}, | ||
| dtype=rvalues.dtype, | ||
| ) | ||
| df.index = self.index | ||
| df.columns = self.columns | ||
| return df.__finalize__(series) | ||
| 
     | 
||
| def _flex_arith_method( | ||
| self, other, op, *, axis: Axis = "columns", level=None, fill_value=None | ||
| 
        
          
        
         | 
    @@ -8625,11 +8635,6 @@ def _flex_arith_method( | |
| if self._should_reindex_frame_op(other, op, axis, fill_value, level): | ||
| return self._arith_method_with_reindex(other, op) | ||
| 
     | 
||
| if isinstance(other, Series) and fill_value is not None: | ||
| # TODO: We could allow this in cases where we end up going | ||
| # through the DataFrame path | ||
| raise NotImplementedError(f"fill_value {fill_value} not supported.") | ||
| 
     | 
||
| other = ops.maybe_prepare_scalar_for_op(other, self.shape) | ||
| self, other = self._align_for_op(other, axis, flex=True, level=level) | ||
| 
     | 
||
| 
          
            
          
           | 
    ||
| Original file line number | Diff line number | Diff line change | 
|---|---|---|
| 
          
            
          
           | 
    @@ -244,9 +244,6 @@ def test_mul(any_string_dtype): | |
| 
     | 
||
| def test_add_strings(any_string_dtype, request): | ||
| dtype = any_string_dtype | ||
| if dtype != np.dtype(object): | ||
| mark = pytest.mark.xfail(reason="GH-28527") | ||
| request.applymarker(mark) | ||
| arr = pd.array(["a", "b", "c", "d"], dtype=dtype) | ||
| df = pd.DataFrame([["t", "y", "v", "w"]], dtype=object) | ||
| assert arr.__add__(df) is NotImplemented | ||
| 
        
          
        
         | 
    @@ -261,10 +258,15 @@ def test_add_strings(any_string_dtype, request): | |
| 
     | 
||
| 
     | 
||
| @pytest.mark.xfail(reason="GH-28527") | ||
| def test_add_frame(dtype): | ||
| def test_add_frame(request, dtype): | ||
| if dtype.storage == "python": | ||
| # Inconsistent behavior between different versions of the python engine. | ||
| # Some return correctly, some return but with the wrong dtype | ||
| # Others just fail, we are blanket failing all | ||
| mark = pytest.mark.xfail(reason="[XPASS(strict)] GH-28527") | ||
| 
         There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is confusing. There is a decorator xfailing the test. Now you're adding a second xfail with a reason being that it passes? do two xfail markers cancel out? i think the thing to do is remove the decorator and add the marker in only the actually-failing cases There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That's valid, I will edit the test  | 
||
| request.node.applymarker(mark) | ||
| arr = pd.array(["a", "b", np.nan, np.nan], dtype=dtype) | ||
| df = pd.DataFrame([["x", np.nan, "y", np.nan]]) | ||
| 
     | 
||
| assert arr.__add__(df) is NotImplemented | ||
| 
     | 
||
| result = arr + df | ||
| 
          
            
          
           | 
    ||
| Original file line number | Diff line number | Diff line change | 
|---|---|---|
| 
          
            
          
           | 
    @@ -258,8 +258,7 @@ def test_astype_float(dtype, any_float_dtype): | |
| # Don't compare arrays (37974) | ||
| ser = pd.Series(["1.1", pd.NA, "3.3"], dtype=dtype) | ||
| result = ser.astype(any_float_dtype) | ||
| item = np.nan if isinstance(result.dtype, np.dtype) else pd.NA | ||
| expected = pd.Series([1.1, item, 3.3], dtype=any_float_dtype) | ||
| expected = pd.Series([1.1, np.nan, 3.3], dtype=any_float_dtype) | ||
| 
         There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. seems unrelated to this PR? sure its necessary? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, I dont remember why I made this change  | 
||
| tm.assert_series_equal(result, expected) | ||
| 
     | 
||
| 
     | 
||
| 
          
            
          
           | 
    ||
| Original file line number | Diff line number | Diff line change | 
|---|---|---|
| 
          
            
          
           | 
    @@ -629,11 +629,43 @@ def test_arith_flex_frame_corner(self, float_frame): | |
| expected = float_frame.sort_index() * np.nan | ||
| tm.assert_frame_equal(result, expected) | ||
| 
     | 
||
| with pytest.raises(NotImplementedError, match="fill_value"): | ||
| float_frame.add(float_frame.iloc[0], fill_value=3) | ||
| @pytest.mark.parametrize("axis", [0, 1]) | ||
| def test_arith_flex_frame_fill_value_series(self, float_frame, axis): | ||
| rng = np.random.default_rng(60) | ||
| mask = rng.random(float_frame.shape) < 0.2 | ||
| left = float_frame.mask(mask) | ||
| right = left.iloc[0] | ||
| 
     | 
||
| result = left.add(right, axis=axis, fill_value=3) | ||
| 
     | 
||
| if axis == 0: # axis = index, vertical | ||
| pad_num = abs(result.shape[0] - len(right)) | ||
| mult_num = result.shape[1] | ||
| right_pad = np.pad( | ||
| right, (0, pad_num), mode="constant", constant_values=(np.nan) | ||
| ) | ||
| right_df = DataFrame( | ||
| [right_pad] * mult_num, columns=result.index, index=result.columns | ||
| ).T | ||
| 
     | 
||
| left = left.reindex_like(result) | ||
| 
     | 
||
| else: # axis = columns, horizontal | ||
| pad_num = abs(result.shape[1] - len(right)) | ||
| mult_num = result.shape[0] | ||
| right_pad = np.pad( | ||
| right, (0, pad_num), mode="constant", constant_values=(np.nan) | ||
| ) | ||
| right_df = DataFrame( | ||
| [right_pad] * mult_num, index=result.index, columns=result.columns | ||
| ) | ||
| 
     | 
||
| with pytest.raises(NotImplementedError, match="fill_value"): | ||
| float_frame.add(float_frame.iloc[0], axis="index", fill_value=3) | ||
| left_filled = left.fillna(3) | ||
| right_filled = right_df.fillna(3) | ||
| expected = right_filled + left_filled | ||
| expected = expected.mask(expected == 6, pd.NA) | ||
| 
     | 
||
| tm.assert_frame_equal(result, expected) | ||
| 
     | 
||
| @pytest.mark.parametrize("op", ["add", "sub", "mul", "mod"]) | ||
| def test_arith_flex_series_ops(self, simple_frame, op): | ||
| 
          
            
          
           | 
    @@ -675,11 +707,21 @@ def test_arith_flex_zero_len_raises(self): | |
| df_len0 = DataFrame(columns=["A", "B"]) | ||
| df = DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) | ||
| 
     | 
||
| with pytest.raises(NotImplementedError, match="fill_value"): | ||
| msg = r"unsupported operand type\(s\) for \+: 'int' and 'str'" | ||
| with pytest.raises(TypeError, match=msg): | ||
| df.add(ser_len0, fill_value="E") | ||
| 
     | 
||
| with pytest.raises(NotImplementedError, match="fill_value"): | ||
| df_len0.sub(df["A"], axis=None, fill_value=3) | ||
| result = df_len0.sub(df, axis=None, fill_value=3) | ||
| expected = DataFrame([[2, 1], [0, -1]], columns=["A", "B"]) | ||
| tm.assert_frame_equal(result, expected, check_dtype=False) | ||
| 
     | 
||
| result = df_len0.sub(df["A"], axis=0, fill_value=3) | ||
| expected = DataFrame([[2, 2], [0, 0]], columns=["A", "B"]) | ||
| tm.assert_frame_equal(result, expected, check_dtype=False) | ||
| 
     | 
||
| result = df_len0.sub(df["A"], axis=1, fill_value=3) | ||
| expected = DataFrame([], columns=["A", "B", 0, 1]) | ||
| tm.assert_frame_equal(result, expected, check_dtype=False) | ||
| 
     | 
||
| def test_flex_add_scalar_fill_value(self): | ||
| # GH#12723 | ||
| 
          
            
          
           | 
    @@ -2201,3 +2243,54 @@ def test_mixed_col_index_dtype(string_dtype_no_object): | |
| expected.columns = expected.columns.astype(string_dtype_no_object) | ||
| 
     | 
||
| tm.assert_frame_equal(result, expected) | ||
| 
     | 
||
| 
     | 
||
| dt_params = [ | ||
| (tm.ALL_INT_NUMPY_DTYPES[0], 10), | ||
| (tm.ALL_INT_EA_DTYPES[0], 10), | ||
| (tm.FLOAT_NUMPY_DTYPES[0], 4.9), | ||
| (tm.FLOAT_EA_DTYPES[0], 4.9), | ||
| ] | ||
| 
     | 
||
| axes = [0, 1] | ||
| 
     | 
||
| 
     | 
||
| @pytest.mark.parametrize( | ||
| "data_type,fill_val, axis", | ||
| [(dt, val, axis) for axis in axes for dt, val in dt_params], | ||
| ) | ||
| def test_df_mul_array_fill_value(data_type, fill_val, axis): | ||
| 
         There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i remain super-skeptical that this test (or test_arith_flex_frame_fill_value_series) are minimally-complex for what you are trying to do.  | 
||
| # GH 61581 | ||
| base_data = np.arange(12).reshape(4, 3) | ||
| df = DataFrame(base_data) | ||
| mult_list = [np.nan, 1, 5, np.nan] | ||
| mult_list = mult_list[: df.shape[axis]] | ||
| 
     | 
||
| if data_type in tm.ALL_INT_NUMPY_DTYPES: | ||
| # Numpy int type cannot represent NaN | ||
| mult_np = np.asarray(mult_list) | ||
| mult_list = np.nan_to_num(mult_np, nan=fill_val) | ||
| 
     | 
||
| mult_data = pd.array(mult_list, dtype=data_type) | ||
| 
     | 
||
| for i in range(df.shape[0]): | ||
| try: | ||
| df.iat[i, i] = np.nan | ||
| df.iat[i + 2, i] = pd.NA | ||
| except IndexError: | ||
| pass | ||
| 
     | 
||
| if axis == 0: | ||
| mult_mat = np.broadcast_to(mult_data.reshape(-1, 1), df.shape) | ||
| mask = np.isnan(mult_mat) | ||
| else: | ||
| mult_mat = np.broadcast_to(mult_data.reshape(1, -1), df.shape) | ||
| mask = np.isnan(mult_mat) | ||
| mask = df.isna().values & mask | ||
| 
     | 
||
| df_result = df.mul(mult_data, axis=axis, fill_value=fill_val) | ||
| df_expected = (df.fillna(fill_val).mul(mult_data.fillna(fill_val), axis=axis)).mask( | ||
| mask, np.nan | ||
| ) | ||
| 
     | 
||
| tm.assert_frame_equal(df_result, df_expected) | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
it looks like you accidentally over-wrote a different note?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looks like I did, will fix