diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index ffa65032e6aae..ead60ad82f8cf 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -987,6 +987,7 @@ I/O - Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`) - Bug in :meth:`read_csv` where the order of the ``na_values`` makes an inconsistency when ``na_values`` is a list non-string values. (:issue:`59303`) - Bug in :meth:`read_csv` with ``engine="pyarrow"`` and ``dtype="Int64"`` losing precision (:issue:`56136`) +- Bug in :meth:`read_csv` with dictionary-based dtype specifications not preserving leading zeros consistently across parser engines (:issue:`57666`) - Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`) - Bug in :meth:`read_html` where ``rowspan`` in header row causes incorrect conversion to ``DataFrame``. (:issue:`60210`) - Bug in :meth:`read_json` ignoring the given ``dtype`` when ``engine="pyarrow"`` (:issue:`59516`) diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index ad39d0ebf4326..294cccea189ed 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -19,6 +19,8 @@ ) from pandas.core.dtypes.inference import is_integer +from pandas.core.arrays.arrow.array import to_pyarrow_type + from pandas.io._util import arrow_table_to_pandas from pandas.io.parsers.base_parser import ParserBase @@ -139,6 +141,30 @@ def handle_warning(invalid_row) -> str: f"f{n}" for n in self.convert_options["include_columns"] ] + if self.dtype is not None: + if isinstance(self.dtype, dict): + column_types = {} + for col, col_dtype in self.dtype.items(): + source_dtype = pandas_dtype(col_dtype) + + try: + target_dtype = to_pyarrow_type(source_dtype.type) + if target_dtype: + column_types[col] = target_dtype + + except TypeError: + # TODO: Unsupported dtypes silently ignored - may cause + # unexpected behavior when pyarrow applies default inference + # instead of user's dtype + pass + + if column_types: + self.convert_options["column_types"] = column_types + else: + # TODO: Global dtypes not supported - may cause inconsistent behavior + # between engines, especially for leading zero preservation + pass + self.read_options = { "autogenerate_column_names": self.header is None, "skip_rows": self.header diff --git a/pandas/tests/io/parser/test_preserve_leading_zeros.py b/pandas/tests/io/parser/test_preserve_leading_zeros.py new file mode 100644 index 0000000000000..757962d06817a --- /dev/null +++ b/pandas/tests/io/parser/test_preserve_leading_zeros.py @@ -0,0 +1,71 @@ +from io import StringIO + +import pytest + + +def test_leading_zeros_preserved_with_dtype_str(all_parsers, request): + # GH#57666: pyarrow engine strips leading zeros when dtype=str is passed + # GH#61618: further discussion on ensuring string dtype preservation across engines + + parser = all_parsers + engine_name = getattr(parser, "engine", "unknown") + + data = """col1,col2,col3,col4 +AB,000388907,abc,0150 +CD,101044572,def,0150 +EF,000023607,ghi,0205 +GH,100102040,jkl,0205""" + + result = parser.read_csv( + StringIO(data), + dtype=str, + ) + + try: + assert result.shape == (4, 4) + assert list(result.columns) == ["col1", "col2", "col3", "col4"] + + assert result.loc[0, "col2"] == "000388907", "lost zeros in col2 row 0" + assert result.loc[2, "col2"] == "000023607", "lost zeros in col2 row 2" + assert result.loc[0, "col4"] == "0150", "lost zeros in col4 row 0" + assert result.loc[2, "col4"] == "0205", "lost zeros in col4 row 2" + + except AssertionError as exc: + if engine_name == "pyarrow": + # Temporary workaround for GH#57666 + # Remove once type preservation is fixed in pyarrow engine. + request.node.add_marker( + pytest.mark.xfail(reason=f"failed assertions: {exc}", strict=False) + ) + raise + + +def test_leading_zeros_preserved_with_dtype_dict(all_parsers): + # GH#57666: pyarrow engine strips leading zeros when dtype=str is passed + # GH#61618: further discussion on ensuring string dtype preservation across engines + + parser = all_parsers + + data = """col1,col2,col3,col4 +AB,000388907,199,0150 +CD,101044572,200,0150 +EF,000023607,201,0205 +GH,100102040,202,0205""" + + result = parser.read_csv( + StringIO(data), + dtype={"col2": str, "col3": int, "col4": str}, + ) + + assert result.shape == (4, 4) + assert list(result.columns) == ["col1", "col2", "col3", "col4"] + + assert result.loc[0, "col2"] == "000388907", "lost zeros in col2 row 0" + assert result.loc[2, "col2"] == "000023607", "lost zeros in col2 row 2" + assert result.loc[0, "col4"] == "0150", "lost zeros in col4 row 0" + assert result.loc[2, "col4"] == "0205", "lost zeros in col4 row 2" + + assert result.loc[0, "col3"] == 199 + assert result.loc[1, "col3"] == 200 + assert result.loc[2, "col3"] == 201 + assert result.loc[3, "col3"] == 202