Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -987,6 +987,7 @@ I/O
- Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`)
- Bug in :meth:`read_csv` where the order of the ``na_values`` makes an inconsistency when ``na_values`` is a list non-string values. (:issue:`59303`)
- Bug in :meth:`read_csv` with ``engine="pyarrow"`` and ``dtype="Int64"`` losing precision (:issue:`56136`)
- Bug in :meth:`read_csv` with dictionary-based dtype specifications not preserving leading zeros consistently across parser engines (:issue:`57666`)
- Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`)
- Bug in :meth:`read_html` where ``rowspan`` in header row causes incorrect conversion to ``DataFrame``. (:issue:`60210`)
- Bug in :meth:`read_json` ignoring the given ``dtype`` when ``engine="pyarrow"`` (:issue:`59516`)
Expand Down
26 changes: 26 additions & 0 deletions pandas/io/parsers/arrow_parser_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
)
from pandas.core.dtypes.inference import is_integer

from pandas.core.arrays.arrow.array import to_pyarrow_type

from pandas.io._util import arrow_table_to_pandas
from pandas.io.parsers.base_parser import ParserBase

Expand Down Expand Up @@ -139,6 +141,30 @@ def handle_warning(invalid_row) -> str:
f"f{n}" for n in self.convert_options["include_columns"]
]

        # Translate a user-supplied per-column dtype mapping into pyarrow's
        # ``convert_options["column_types"]`` so the pyarrow engine applies the
        # requested dtypes instead of inferring them (e.g. ``dtype={"c": str}``
        # keeps leading zeros). (GH#57666)
        if self.dtype is not None:
            if isinstance(self.dtype, dict):
                column_types = {}
                for col, col_dtype in self.dtype.items():
                    # Normalize whatever the user passed (``str``, ``"Int64"``,
                    # a numpy dtype, ...) into a pandas dtype object.
                    source_dtype = pandas_dtype(col_dtype)

                    try:
                        # NOTE(review): this passes ``source_dtype.type`` (the
                        # scalar type) rather than the dtype object itself —
                        # confirm ``to_pyarrow_type`` expects that; it may
                        # return ``None`` for dtypes it could otherwise map.
                        target_dtype = to_pyarrow_type(source_dtype.type)
                        if target_dtype:
                            column_types[col] = target_dtype

                    except TypeError:
                        # TODO: Unsupported dtypes silently ignored - may cause
                        # unexpected behavior when pyarrow applies default inference
                        # instead of user's dtype
                        pass

                # Only override pyarrow's inference when at least one column
                # mapped successfully; otherwise leave convert_options untouched.
                # NOTE(review): when ``header is None`` the include_columns above
                # are renamed to ``f{n}`` — verify the dict keys here do not also
                # need that renaming to line up with pyarrow's column names.
                if column_types:
                    self.convert_options["column_types"] = column_types
            else:
                # TODO: Global dtypes not supported - may cause inconsistent behavior
                # between engines, especially for leading zero preservation
                pass

self.read_options = {
"autogenerate_column_names": self.header is None,
"skip_rows": self.header
Expand Down
71 changes: 71 additions & 0 deletions pandas/tests/io/parser/test_preserve_leading_zeros.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
from io import StringIO

import pytest


def test_leading_zeros_preserved_with_dtype_str(all_parsers, request):
    # GH#57666: pyarrow engine strips leading zeros when dtype=str is passed
    # GH#61618: further discussion on ensuring string dtype preservation across engines
    parser = all_parsers

    # Apply the xfail marker up front (standard pattern for engine-specific
    # known failures) instead of catching AssertionError and marking afterwards.
    # strict=False because the underlying pyarrow behavior may change.
    # TODO: remove once dtype=str preserves leading zeros in the pyarrow engine.
    if getattr(parser, "engine", None) == "pyarrow":
        request.applymarker(
            pytest.mark.xfail(
                reason="pyarrow engine strips leading zeros with dtype=str (GH#57666)",
                strict=False,
            )
        )

    data = """col1,col2,col3,col4
AB,000388907,abc,0150
CD,101044572,def,0150
EF,000023607,ghi,0205
GH,100102040,jkl,0205"""

    result = parser.read_csv(
        StringIO(data),
        dtype=str,
    )

    assert result.shape == (4, 4)
    assert list(result.columns) == ["col1", "col2", "col3", "col4"]

    assert result.loc[0, "col2"] == "000388907", "lost zeros in col2 row 0"
    assert result.loc[2, "col2"] == "000023607", "lost zeros in col2 row 2"
    assert result.loc[0, "col4"] == "0150", "lost zeros in col4 row 0"
    assert result.loc[2, "col4"] == "0205", "lost zeros in col4 row 2"


def test_leading_zeros_preserved_with_dtype_dict(all_parsers):
    # GH#57666: pyarrow engine strips leading zeros when dtype=str is passed
    # GH#61618: further discussion on ensuring string dtype preservation across engines

    parser = all_parsers

    data = """col1,col2,col3,col4
AB,000388907,199,0150
CD,101044572,200,0150
EF,000023607,201,0205
GH,100102040,202,0205"""

    result = parser.read_csv(
        StringIO(data),
        dtype={"col2": str, "col3": int, "col4": str},
    )

    assert result.shape == (4, 4)
    assert result.columns.tolist() == ["col1", "col2", "col3", "col4"]

    # str-typed columns must keep their leading zeros
    expected_str_cells = {
        (0, "col2"): "000388907",
        (2, "col2"): "000023607",
        (0, "col4"): "0150",
        (2, "col4"): "0205",
    }
    for (row, col), expected in expected_str_cells.items():
        assert result.loc[row, col] == expected, f"lost zeros in {col} row {row}"

    # int-typed column parses as plain integers
    for row, expected in enumerate([199, 200, 201, 202]):
        assert result.loc[row, "col3"] == expected
Loading