Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 28 additions & 2 deletions pandas/core/dtypes/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -2308,8 +2308,34 @@ def kind(self) -> str:

@cache_readonly
def itemsize(self) -> int:
"""Return the number of bytes in this dtype"""
return self.numpy_dtype.itemsize
"""
Return the number of bytes in this dtype.

For Arrow-backed dtypes:
- Returns the fixed-width bit size divided by 8 for standard fixed-width types.
- For boolean types, returns the NumPy itemsize.
- Falls back to the NumPy dtype itemsize for variable-width & unsupported types.

Examples
--------
>>> import pyarrow as pa
>>> import pandas as pd
>>> dtype = pd.ArrowDtype(pa.int32())
>>> dtype.itemsize
4

>>> dtype = pd.ArrowDtype(pa.bool_())
>>> dtype.itemsize # falls back to numpy dtype
1
"""
# Use pyarrow itemsize for fixed-width data types
# e.g. int32 -> 32 bits // 8 = 4 bytes
try:
if pa.types.is_boolean(self.pyarrow_dtype):
return self.numpy_dtype.itemsize
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you move this outside the try except?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure thing. Do you think it would be better to also catch NotImplementedError in the except block?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

catch NotImplementedError in the except block?

Yes, or experiment to see which exceptions .bit_width could raise on the pyarrow side

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have made the corrections and checked them by running the CI routine on my fork; it works as expected.
Waiting for the CI routine to be fixed on this repo.

return self.pyarrow_dtype.bit_width // 8
except (ValueError, AttributeError):
return self.numpy_dtype.itemsize

def construct_array_type(self) -> type_t[ArrowExtensionArray]:
"""
Expand Down
90 changes: 90 additions & 0 deletions pandas/tests/dtypes/test_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -1259,3 +1259,93 @@ def test_categorical_nan_no_dtype_conversion():
expected = pd.DataFrame({"a": Categorical([1], [1]), "b": [1]})
df.loc[0, "a"] = np.array([1])
tm.assert_frame_equal(df, expected)


@pytest.fixture
def pa():
    """Return the ``pyarrow`` module obtained through ``pytest.importorskip``."""
    pyarrow = pytest.importorskip("pyarrow")
    return pyarrow


@pytest.mark.parametrize(
"type_name, expected_size",
[
# Integer types
("int8", 1),
("int16", 2),
("int32", 4),
("int64", 8),
("uint8", 1),
("uint16", 2),
("uint32", 4),
("uint64", 8),
# Floating point types
("float16", 2),
("float32", 4),
("float64", 8),
# Boolean
("bool_", 1),
# Date and timestamp types
("date32", 4),
("date64", 8),
("timestamp", 8),
# Time types
("time32", 4),
("time64", 8),
# Decimal types
("decimal128", 16),
("decimal256", 32),
],
)
def test_arrow_dtype_itemsize_fixed_width(pa, type_name, expected_size):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you move this test to pandas/tests/extension/test_arrow.py?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Only this particular test test_arrow_dtype_itemsize_fixed_width or all 3? I guess all 3 as they test the same functionality?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

All the new tests in this file

# GH 57948

parametric_type_map = {
"timestamp": pa.timestamp("ns"),
"time32": pa.time32("s"),
"time64": pa.time64("ns"),
"decimal128": pa.decimal128(38, 10),
"decimal256": pa.decimal256(76, 10),
}

if type_name in parametric_type_map:
arrow_type = parametric_type_map.get(type_name)
else:
arrow_type = getattr(pa, type_name)()
dtype = pd.ArrowDtype(arrow_type)

if type_name == "bool_":
expected_size = dtype.numpy_dtype.itemsize

assert dtype.itemsize == expected_size, (
f"{type_name} expected {expected_size}, got {dtype.itemsize} "
f"(bit_width={getattr(dtype.pyarrow_dtype, 'bit_width', 'N/A')})"
)


@pytest.mark.parametrize("type_name", ["string", "binary", "large_string"])
def test_arrow_dtype_itemsize_variable_width(pa, type_name):
    """Check that itemsize equals the NumPy dtype itemsize for these Arrow types."""
    # GH 57948
    arrow_type_factory = getattr(pa, type_name)
    arrow_dtype = pd.ArrowDtype(arrow_type_factory())

    observed = arrow_dtype.itemsize
    expected = arrow_dtype.numpy_dtype.itemsize
    assert observed == expected


def test_arrow_dtype_error_fallback(pa, monkeypatch):
    """Check itemsize equals the NumPy dtype itemsize when ``bit_width`` raises."""
    # GH 57948

    class ErrorType:
        # Stand-in for a pyarrow data type whose ``bit_width`` lookup fails.
        # NOTE(review): ``id = None`` presumably keeps pyarrow's type predicates
        # from matching this object -- confirm against pa.types.is_boolean.
        id = None

        @property
        def bit_width(self):
            raise ValueError("Simulated Error")

        def to_pandas_dtype(self):
            return Series([0]).dtype

    dtype = pd.ArrowDtype(pa.int32())
    # Swap in the failing type only after construction with a real Arrow type.
    monkeypatch.setattr(dtype, "pyarrow_dtype", ErrorType())

    observed = dtype.itemsize
    expected = dtype.numpy_dtype.itemsize
    assert observed == expected
Loading