From b766c159178afd0628a5096e0fb03c9e130d6768 Mon Sep 17 00:00:00 2001 From: Daniel Caspi Date: Tue, 2 Sep 2025 14:54:30 -0500 Subject: [PATCH 01/20] Create test_preserve_leading_zeros.py --- .../io/parser/test_preserve_leading_zeros.py | 48 +++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 pandas/tests/io/parser/test_preserve_leading_zeros.py diff --git a/pandas/tests/io/parser/test_preserve_leading_zeros.py b/pandas/tests/io/parser/test_preserve_leading_zeros.py new file mode 100644 index 0000000000000..fa0e00e41c48b --- /dev/null +++ b/pandas/tests/io/parser/test_preserve_leading_zeros.py @@ -0,0 +1,48 @@ +import pytest +from io import StringIO +import pandas._testing as tm + + +@pytest.mark.xfail(reason="Leading zeros preservation may not work consistently across all engines") +def test_leading_zeros_preserved_with_dtype_str(all_parsers): + """ + Ensure that all parser engines preserve leading zeros when dtype=str is passed. + + This test verifies that when dtype=str is specified, leading zeros in + numeric-looking strings are preserved across all available parser engines. + """ + parser = all_parsers + engine_name = getattr(parser, 'engine', 'unknown') + + data = """col1|col2|col3|col4 +AB|000388907|abc|0150 +CD|101044572|def|0150 +EF|000023607|ghi|0205 +GH|100102040|jkl|0205""" + + result = parser.read_csv( + StringIO(data), + sep="|", + dtype=str, + ) + + # Verify leading zeros are preserved in col2 + assert result.loc[0, "col2"] == "000388907", f"Engine {engine_name}: Leading zeros lost in col2, row 0. Got: {result.loc[0, 'col2']}" + assert result.loc[2, "col2"] == "000023607", f"Engine {engine_name}: Leading zeros lost in col2, row 2. Got: {result.loc[2, 'col2']}" + + # Verify leading zeros are preserved in col4 + assert result.loc[0, "col4"] == "0150", f"Engine {engine_name}: Leading zeros lost in col4, row 0. Got: {result.loc[0, 'col4']}" + assert result.loc[2, "col4"] == "0205", f"Engine {engine_name}: Leading zeros lost in col4, row 2. Got: {result.loc[2, 'col4']}" + + # Verify all columns are string type + assert result.dtypes["col1"] == "object", f"Engine {engine_name}: col1 should be string type, got {result.dtypes['col1']}" + assert result.dtypes["col2"] == "object", f"Engine {engine_name}: col2 should be string type, got {result.dtypes['col2']}" + assert result.dtypes["col3"] == "object", f"Engine {engine_name}: col3 should be string type, got {result.dtypes['col3']}" + assert result.dtypes["col4"] == "object", f"Engine {engine_name}: col4 should be string type, got {result.dtypes['col4']}" + + # Verify shape + assert result.shape == (4, 4), f"Engine {engine_name}: Expected shape (4, 4), got {result.shape}" + + # Verify column names + expected_columns = ["col1", "col2", "col3", "col4"] + assert list(result.columns) == expected_columns, f"Engine {engine_name}: Expected columns {expected_columns}, got {list(result.columns)}" From ea08043c5a62f68d8a4571b2619e7b9c7807bc1d Mon Sep 17 00:00:00 2001 From: Daniel Caspi Date: Tue, 2 Sep 2025 16:03:08 -0500 Subject: [PATCH 02/20] Update test_preserve_leading_zeros.py --- .../io/parser/test_preserve_leading_zeros.py | 69 ++++++++++++------- 1 file changed, 46 insertions(+), 23 deletions(-) diff --git a/pandas/tests/io/parser/test_preserve_leading_zeros.py b/pandas/tests/io/parser/test_preserve_leading_zeros.py index fa0e00e41c48b..e4254df70b646 100644 --- a/pandas/tests/io/parser/test_preserve_leading_zeros.py +++ b/pandas/tests/io/parser/test_preserve_leading_zeros.py @@ -1,48 +1,71 @@ -import pytest from io import StringIO -import pandas._testing as tm + +import pytest -@pytest.mark.xfail(reason="Leading zeros preservation may not work consistently across all engines") +@pytest.mark.xfail( + condition=getattr(all_parsers, "engine", "") == "pyarrow", + reason="pyarrow engine strips leading zeros even with dtype=str", +) def test_leading_zeros_preserved_with_dtype_str(all_parsers): """ Ensure that all parser engines preserve leading zeros when dtype=str is passed. - - This test verifies that when dtype=str is specified, leading zeros in + + This test verifies that when dtype=str is specified, leading zeros in numeric-looking strings are preserved across all available parser engines. """ parser = all_parsers - engine_name = getattr(parser, 'engine', 'unknown') - + engine_name = getattr(parser, "engine", "unknown") + data = """col1|col2|col3|col4 AB|000388907|abc|0150 CD|101044572|def|0150 EF|000023607|ghi|0205 GH|100102040|jkl|0205""" - + result = parser.read_csv( StringIO(data), sep="|", dtype=str, ) - + # Verify leading zeros are preserved in col2 - assert result.loc[0, "col2"] == "000388907", f"Engine {engine_name}: Leading zeros lost in col2, row 0. Got: {result.loc[0, 'col2']}" - assert result.loc[2, "col2"] == "000023607", f"Engine {engine_name}: Leading zeros lost in col2, row 2. Got: {result.loc[2, 'col2']}" - + assert result.loc[0, "col2"] == "000388907", ( + f"Engine {engine_name}: Leading zeros lost in col2, row 0. Got: {result.loc[0, 'col2']}" + ) + assert result.loc[2, "col2"] == "000023607", ( + f"Engine {engine_name}: Leading zeros lost in col2, row 2. Got: {result.loc[2, 'col2']}" + ) + # Verify leading zeros are preserved in col4 - assert result.loc[0, "col4"] == "0150", f"Engine {engine_name}: Leading zeros lost in col4, row 0. Got: {result.loc[0, 'col4']}" - assert result.loc[2, "col4"] == "0205", f"Engine {engine_name}: Leading zeros lost in col4, row 2. Got: {result.loc[2, 'col4']}" - + assert result.loc[0, "col4"] == "0150", ( + f"Engine {engine_name}: Leading zeros lost in col4, row 0. Got: {result.loc[0, 'col4']}" + ) + assert result.loc[2, "col4"] == "0205", ( + f"Engine {engine_name}: Leading zeros lost in col4, row 2. Got: {result.loc[2, 'col4']}" + ) + # Verify all columns are string type - assert result.dtypes["col1"] == "object", f"Engine {engine_name}: col1 should be string type, got {result.dtypes['col1']}" - assert result.dtypes["col2"] == "object", f"Engine {engine_name}: col2 should be string type, got {result.dtypes['col2']}" - assert result.dtypes["col3"] == "object", f"Engine {engine_name}: col3 should be string type, got {result.dtypes['col3']}" - assert result.dtypes["col4"] == "object", f"Engine {engine_name}: col4 should be string type, got {result.dtypes['col4']}" - + assert result.dtypes["col1"] == "object", ( + f"Engine {engine_name}: col1 should be string type, got {result.dtypes['col1']}" + ) + assert result.dtypes["col2"] == "object", ( + f"Engine {engine_name}: col2 should be string type, got {result.dtypes['col2']}" + ) + assert result.dtypes["col3"] == "object", ( + f"Engine {engine_name}: col3 should be string type, got {result.dtypes['col3']}" + ) + assert result.dtypes["col4"] == "object", ( + f"Engine {engine_name}: col4 should be string type, got {result.dtypes['col4']}" + ) + # Verify shape - assert result.shape == (4, 4), f"Engine {engine_name}: Expected shape (4, 4), got {result.shape}" - + assert result.shape == (4, 4), ( + f"Engine {engine_name}: Expected shape (4, 4), got {result.shape}" + ) + # Verify column names expected_columns = ["col1", "col2", "col3", "col4"] - assert list(result.columns) == expected_columns, f"Engine {engine_name}: Expected columns {expected_columns}, got {list(result.columns)}" + assert list(result.columns) == expected_columns, ( + f"Engine {engine_name}: Expected columns {expected_columns}, got {list(result.columns)}" + ) From 206cb69c007681c7e5e3f6a17898bafce4bc63f3 Mon Sep 17 00:00:00 2001 From: Daniel Caspi Date: Tue, 2 Sep 2025 16:34:47 -0500 Subject: [PATCH 03/20] Update test_preserve_leading_zeros.py --- .../io/parser/test_preserve_leading_zeros.py | 71 +++++++------------ 1 file changed, 27 insertions(+), 44 deletions(-) diff --git a/pandas/tests/io/parser/test_preserve_leading_zeros.py b/pandas/tests/io/parser/test_preserve_leading_zeros.py index e4254df70b646..24cd353955bf6 100644 --- a/pandas/tests/io/parser/test_preserve_leading_zeros.py +++ b/pandas/tests/io/parser/test_preserve_leading_zeros.py @@ -3,10 +3,6 @@ import pytest -@pytest.mark.xfail( - condition=getattr(all_parsers, "engine", "") == "pyarrow", - reason="pyarrow engine strips leading zeros even with dtype=str", -) def test_leading_zeros_preserved_with_dtype_str(all_parsers): """ Ensure that all parser engines preserve leading zeros when dtype=str is passed. @@ -29,43 +25,30 @@ def test_leading_zeros_preserved_with_dtype_str(all_parsers): dtype=str, ) - # Verify leading zeros are preserved in col2 - assert result.loc[0, "col2"] == "000388907", ( - f"Engine {engine_name}: Leading zeros lost in col2, row 0. Got: {result.loc[0, 'col2']}" - ) - assert result.loc[2, "col2"] == "000023607", ( - f"Engine {engine_name}: Leading zeros lost in col2, row 2. Got: {result.loc[2, 'col2']}" - ) - - # Verify leading zeros are preserved in col4 - assert result.loc[0, "col4"] == "0150", ( - f"Engine {engine_name}: Leading zeros lost in col4, row 0. Got: {result.loc[0, 'col4']}" - ) - assert result.loc[2, "col4"] == "0205", ( - f"Engine {engine_name}: Leading zeros lost in col4, row 2. Got: {result.loc[2, 'col4']}" - ) - - # Verify all columns are string type - assert result.dtypes["col1"] == "object", ( - f"Engine {engine_name}: col1 should be string type, got {result.dtypes['col1']}" - ) - assert result.dtypes["col2"] == "object", ( - f"Engine {engine_name}: col2 should be string type, got {result.dtypes['col2']}" - ) - assert result.dtypes["col3"] == "object", ( - f"Engine {engine_name}: col3 should be string type, got {result.dtypes['col3']}" - ) - assert result.dtypes["col4"] == "object", ( - f"Engine {engine_name}: col4 should be string type, got {result.dtypes['col4']}" - ) - - # Verify shape - assert result.shape == (4, 4), ( - f"Engine {engine_name}: Expected shape (4, 4), got {result.shape}" - ) - - # Verify column names - expected_columns = ["col1", "col2", "col3", "col4"] - assert list(result.columns) == expected_columns, ( - f"Engine {engine_name}: Expected columns {expected_columns}, got {list(result.columns)}" - ) + try: + assert result.loc[0, "col2"] == "000388907", ( + f"{engine_name} lost zeros in col2 row 0" + ) + assert result.loc[2, "col2"] == "000023607", ( + f"{engine_name} lost zeros in col2 row 2" + ) + assert result.loc[0, "col4"] == "0150", ( + f"{engine_name} lost zeros in col4 row 0" + ) + assert result.loc[2, "col4"] == "0205", ( + f"{engine_name} lost zeros in col4 row 2" + ) + + for col in ["col1", "col2", "col3", "col4"]: + assert result.dtypes[col] == "object", ( + f"{engine_name} wrong dtype for {col}" + ) + + assert result.shape == (4, 4) + assert list(result.columns) == ["col1", "col2", "col3", "col4"] + except AssertionError as exc: + if engine_name == "pyarrow": + # Known issue: pyarrow engine strips leading zeros even with dtype=str. + pytest.xfail(f"failed assertions: {exc}") + else: + raise From 247d514d6bb6e7fcf8e30b23f75666232584ed77 Mon Sep 17 00:00:00 2001 From: Daniel Caspi Date: Tue, 2 Sep 2025 16:56:20 -0500 Subject: [PATCH 04/20] Update test_preserve_leading_zeros.py --- .../io/parser/test_preserve_leading_zeros.py | 28 ++++++++----------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/pandas/tests/io/parser/test_preserve_leading_zeros.py b/pandas/tests/io/parser/test_preserve_leading_zeros.py index 24cd353955bf6..44ecaaa260311 100644 --- a/pandas/tests/io/parser/test_preserve_leading_zeros.py +++ b/pandas/tests/io/parser/test_preserve_leading_zeros.py @@ -3,7 +3,7 @@ import pytest -def test_leading_zeros_preserved_with_dtype_str(all_parsers): +def test_leading_zeros_preserved_with_dtype_str(all_parsers, request): """ Ensure that all parser engines preserve leading zeros when dtype=str is passed. @@ -26,29 +26,25 @@ def test_leading_zeros_preserved_with_dtype_str(all_parsers): ) try: - assert result.loc[0, "col2"] == "000388907", ( - f"{engine_name} lost zeros in col2 row 0" - ) - assert result.loc[2, "col2"] == "000023607", ( - f"{engine_name} lost zeros in col2 row 2" - ) - assert result.loc[0, "col4"] == "0150", ( - f"{engine_name} lost zeros in col4 row 0" - ) - assert result.loc[2, "col4"] == "0205", ( - f"{engine_name} lost zeros in col4 row 2" - ) + assert result.shape == (4, 4) + assert list(result.columns) == ["col1", "col2", "col3", "col4"] + + assert result.loc[0, "col2"] == "000388907", "lost zeros in col2 row 0" + assert result.loc[2, "col2"] == "000023607", "lost zeros in col2 row 2" + assert result.loc[0, "col4"] == "0150", "lost zeros in col4 row 0" + assert result.loc[2, "col4"] == "0205", "lost zeros in col4 row 2" for col in ["col1", "col2", "col3", "col4"]: assert result.dtypes[col] == "object", ( f"{engine_name} wrong dtype for {col}" ) - assert result.shape == (4, 4) - assert list(result.columns) == ["col1", "col2", "col3", "col4"] except AssertionError as exc: if engine_name == "pyarrow": # Known issue: pyarrow engine strips leading zeros even with dtype=str. - pytest.xfail(f"failed assertions: {exc}") + request.node.add_marker( + pytest.mark.xfail(reason=f"failed assertions: {exc}", strict=False) + ) + assert False, "trigger xfail for pyarrow" else: raise From 3e9f04e1cb23108c453ce40ee0bf624659cbbfca Mon Sep 17 00:00:00 2001 From: Daniel Caspi Date: Tue, 2 Sep 2025 17:17:45 -0500 Subject: [PATCH 05/20] Update test_preserve_leading_zeros.py --- pandas/tests/io/parser/test_preserve_leading_zeros.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/pandas/tests/io/parser/test_preserve_leading_zeros.py b/pandas/tests/io/parser/test_preserve_leading_zeros.py index 44ecaaa260311..999cc58235e01 100644 --- a/pandas/tests/io/parser/test_preserve_leading_zeros.py +++ b/pandas/tests/io/parser/test_preserve_leading_zeros.py @@ -34,11 +34,6 @@ def test_leading_zeros_preserved_with_dtype_str(all_parsers, request): assert result.loc[0, "col4"] == "0150", "lost zeros in col4 row 0" assert result.loc[2, "col4"] == "0205", "lost zeros in col4 row 2" - for col in ["col1", "col2", "col3", "col4"]: - assert result.dtypes[col] == "object", ( - f"{engine_name} wrong dtype for {col}" - ) - except AssertionError as exc: if engine_name == "pyarrow": # Known issue: pyarrow engine strips leading zeros even with dtype=str. From c86f33fb09b4dd9584a92c6c77e8101de0d522f2 Mon Sep 17 00:00:00 2001 From: Daniel Caspi Date: Wed, 3 Sep 2025 08:26:29 -0500 Subject: [PATCH 06/20] Update test_preserve_leading_zeros.py --- pandas/tests/io/parser/test_preserve_leading_zeros.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/tests/io/parser/test_preserve_leading_zeros.py b/pandas/tests/io/parser/test_preserve_leading_zeros.py index 999cc58235e01..980e0daef60aa 100644 --- a/pandas/tests/io/parser/test_preserve_leading_zeros.py +++ b/pandas/tests/io/parser/test_preserve_leading_zeros.py @@ -39,7 +39,4 @@ def test_leading_zeros_preserved_with_dtype_str(all_parsers, request): # Known issue: pyarrow engine strips leading zeros even with dtype=str. request.node.add_marker( pytest.mark.xfail(reason=f"failed assertions: {exc}", strict=False) - ) - assert False, "trigger xfail for pyarrow" - else: - raise + raise From 81b50db8e0a6c650878b2f2e888f0f4134656ba5 Mon Sep 17 00:00:00 2001 From: Daniel Caspi Date: Wed, 3 Sep 2025 08:37:00 -0500 Subject: [PATCH 07/20] Update test_preserve_leading_zeros.py --- pandas/tests/io/parser/test_preserve_leading_zeros.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/io/parser/test_preserve_leading_zeros.py b/pandas/tests/io/parser/test_preserve_leading_zeros.py index 980e0daef60aa..e06ea558608ab 100644 --- a/pandas/tests/io/parser/test_preserve_leading_zeros.py +++ b/pandas/tests/io/parser/test_preserve_leading_zeros.py @@ -39,4 +39,5 @@ def test_leading_zeros_preserved_with_dtype_str(all_parsers, request): # Known issue: pyarrow engine strips leading zeros even with dtype=str. request.node.add_marker( pytest.mark.xfail(reason=f"failed assertions: {exc}", strict=False) + ) raise From 599040000326dbac9f6e5500f4a53a13a3af791a Mon Sep 17 00:00:00 2001 From: Daniel Caspi Date: Wed, 3 Sep 2025 10:52:35 -0500 Subject: [PATCH 08/20] Update test_preserve_leading_zeros.py --- .../io/parser/test_preserve_leading_zeros.py | 21 ++++++++----------- 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/pandas/tests/io/parser/test_preserve_leading_zeros.py b/pandas/tests/io/parser/test_preserve_leading_zeros.py index e06ea558608ab..1bf2810e3834d 100644 --- a/pandas/tests/io/parser/test_preserve_leading_zeros.py +++ b/pandas/tests/io/parser/test_preserve_leading_zeros.py @@ -4,24 +4,20 @@ def test_leading_zeros_preserved_with_dtype_str(all_parsers, request): - """ - Ensure that all parser engines preserve leading zeros when dtype=str is passed. + # GH#57666: pyarrow engine strips leading zeros when dtype=str is passed + # GH#61618: further discussion on ensuring string dtype preservation across engines - This test verifies that when dtype=str is specified, leading zeros in - numeric-looking strings are preserved across all available parser engines. - """ parser = all_parsers engine_name = getattr(parser, "engine", "unknown") - data = """col1|col2|col3|col4 -AB|000388907|abc|0150 -CD|101044572|def|0150 -EF|000023607|ghi|0205 -GH|100102040|jkl|0205""" + data = """col1,col2,col3,col4 +AB,000388907,abc,0150 +CD,101044572,def,0150 +EF,000023607,ghi,0205 +GH,100102040,jkl,0205""" result = parser.read_csv( StringIO(data), - sep="|", dtype=str, ) @@ -36,7 +32,8 @@ def test_leading_zeros_preserved_with_dtype_str(all_parsers, request): except AssertionError as exc: if engine_name == "pyarrow": - # Known issue: pyarrow engine strips leading zeros even with dtype=str. + # Temporary workaround for GH#57666 + # Remove once type preservation is fixed in pyarrow engine. request.node.add_marker( pytest.mark.xfail(reason=f"failed assertions: {exc}", strict=False) ) From d9f6983142487cd221827393637a68b58b8eae52 Mon Sep 17 00:00:00 2001 From: Daniel Caspi Date: Wed, 3 Sep 2025 14:48:59 -0500 Subject: [PATCH 09/20] draft column_types parsing --- pandas/io/parsers/arrow_parser_wrapper.py | 25 +++++++++++++++ .../io/parser/test_preserve_leading_zeros.py | 31 +++++++++++++++++++ 2 files changed, 56 insertions(+) diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index ad39d0ebf4326..24c0c05cdbabd 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -15,6 +15,7 @@ ) from pandas.core.dtypes.common import ( + is_string_dtype, pandas_dtype, ) from pandas.core.dtypes.inference import is_integer @@ -60,6 +61,8 @@ def _get_pyarrow_options(self) -> None: """ Rename some arguments to pass to pyarrow """ + pa = import_optional_dependency("pyarrow") + mapping = { "usecols": "include_columns", "na_values": "null_values", @@ -139,6 +142,28 @@ def handle_warning(invalid_row) -> str: f"f{n}" for n in self.convert_options["include_columns"] ] + if self.dtype is not None: + if isinstance(self.dtype, dict): + column_types = {} + for col, col_dtype in self.dtype.items(): + if is_string_dtype(col_dtype): + column_types[col] = pa.string() + else: + warnings.warn( + f"Column '{col}' has dtype '{col_dtype}', " + "which may not be handled correctly by the pyarrow engine.", + ParserWarning, + stacklevel=find_stack_level(), + ) + if column_types: + self.convert_options["column_types"] = column_types + else: + warnings.warn( + "The pyarrow engine expects a dict mapping columns to types.", + ParserWarning, + stacklevel=find_stack_level(), + ) + self.read_options = { "autogenerate_column_names": self.header is None, "skip_rows": self.header diff --git a/pandas/tests/io/parser/test_preserve_leading_zeros.py b/pandas/tests/io/parser/test_preserve_leading_zeros.py index 1bf2810e3834d..31cc3e82d1439 100644 --- a/pandas/tests/io/parser/test_preserve_leading_zeros.py +++ b/pandas/tests/io/parser/test_preserve_leading_zeros.py @@ -38,3 +38,34 @@ def test_leading_zeros_preserved_with_dtype_str(all_parsers, request): pytest.mark.xfail(reason=f"failed assertions: {exc}", strict=False) ) raise + + +def test_leading_zeros_preserved_with_dtype_dict(all_parsers): + # GH#57666: pyarrow engine strips leading zeros when dtype=str is passed + # GH#61618: further discussion on ensuring string dtype preservation across engines + + parser = all_parsers + + data = """col1,col2,col3,col4 +AB,000388907,199,0150 +CD,101044572,200,0150 +EF,000023607,201,0205 +GH,100102040,202,0205""" + + result = parser.read_csv( + StringIO(data), + dtype={"col2": str, "col4": str}, + ) + + assert result.shape == (4, 4) + assert list(result.columns) == ["col1", "col2", "col3", "col4"] + + assert result.loc[0, "col2"] == "000388907", "lost zeros in col2 row 0" + assert result.loc[2, "col2"] == "000023607", "lost zeros in col2 row 2" + assert result.loc[0, "col4"] == "0150", "lost zeros in col4 row 0" + assert result.loc[2, "col4"] == "0205", "lost zeros in col4 row 2" + + assert result.loc[0, "col3"] == 199 + assert result.loc[1, "col3"] == 200 + assert result.loc[2, "col3"] == 201 + assert result.loc[3, "col3"] == 202 From 0ebca38c26beb0857f7900e39d8ed99082c3d6b7 Mon Sep 17 00:00:00 2001 From: Daniel Caspi Date: Wed, 3 Sep 2025 15:15:04 -0500 Subject: [PATCH 10/20] update tests --- .../io/parser/test_preserve_leading_zeros.py | 63 +++++++++++++++++-- 1 file changed, 58 insertions(+), 5 deletions(-) diff --git a/pandas/tests/io/parser/test_preserve_leading_zeros.py b/pandas/tests/io/parser/test_preserve_leading_zeros.py index 31cc3e82d1439..7c2b6961df3b6 100644 --- a/pandas/tests/io/parser/test_preserve_leading_zeros.py +++ b/pandas/tests/io/parser/test_preserve_leading_zeros.py @@ -2,6 +2,9 @@ import pytest +import pandas._testing as tm +from pandas.errors import ParserWarning + def test_leading_zeros_preserved_with_dtype_str(all_parsers, request): # GH#57666: pyarrow engine strips leading zeros when dtype=str is passed @@ -16,10 +19,19 @@ def test_leading_zeros_preserved_with_dtype_str(all_parsers, request): EF,000023607,ghi,0205 GH,100102040,jkl,0205""" - result = parser.read_csv( - StringIO(data), - dtype=str, - ) + if engine_name == "pyarrow": + with tm.assert_produces_warning( + ParserWarning, match="pyarrow engine expects a dict mapping" + ): + result = parser.read_csv( + StringIO(data), + dtype=str, + ) + else: + result = parser.read_csv( + StringIO(data), + dtype=str, + ) try: assert result.shape == (4, 4) @@ -40,7 +52,7 @@ def test_leading_zeros_preserved_with_dtype_str(all_parsers, request): raise -def test_leading_zeros_preserved_with_dtype_dict(all_parsers): +def test_leading_zeros_preserved_with_dtype_dict_str_only(all_parsers): # GH#57666: pyarrow engine strips leading zeros when dtype=str is passed # GH#61618: further discussion on ensuring string dtype preservation across engines @@ -69,3 +81,44 @@ def test_leading_zeros_preserved_with_dtype_dict(all_parsers): assert result.loc[1, "col3"] == 200 assert result.loc[2, "col3"] == 201 assert result.loc[3, "col3"] == 202 + + +def test_leading_zeros_preserved_with_heterogeneous_dtypes(all_parsers): + # GH#57666: pyarrow engine strips leading zeros when dtype=str is passed + # GH#61618: further discussion on ensuring string dtype preservation across engines + + parser = all_parsers + engine_name = getattr(parser, "engine", "unknown") + + data = """col1,col2,col3,col4 +AB,000388907,199,0150 +CD,101044572,200,0150 +EF,000023607,201,0205 +GH,100102040,202,0205""" + + if engine_name == "pyarrow": + with tm.assert_produces_warning( + ParserWarning, match="may not be handled correctly by the pyarrow engine" + ): + result = parser.read_csv( + StringIO(data), + dtype={"col2": str, "col3": int, "col4": str}, + ) + else: + result = parser.read_csv( + StringIO(data), + dtype={"col2": str, "col3": int, "col4": str}, + ) + + assert result.shape == (4, 4) + assert list(result.columns) == ["col1", "col2", "col3", "col4"] + + assert result.loc[0, "col2"] == "000388907", "lost zeros in col2 row 0" + assert result.loc[2, "col2"] == "000023607", "lost zeros in col2 row 2" + assert result.loc[0, "col4"] == "0150", "lost zeros in col4 row 0" + assert result.loc[2, "col4"] == "0205", "lost zeros in col4 row 2" + + assert result.loc[0, "col3"] == 199 + assert result.loc[1, "col3"] == 200 + assert result.loc[2, "col3"] == 201 + assert result.loc[3, "col3"] == 202 From 22bd4e370c7f020bb131d1faddf47371532580a5 Mon Sep 17 00:00:00 2001 From: Daniel Caspi Date: Wed, 3 Sep 2025 15:20:24 -0500 Subject: [PATCH 11/20] Update v3.0.0.rst --- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index ffa65032e6aae..c3812b4e0a151 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -987,6 +987,7 @@ I/O - Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`) - Bug in :meth:`read_csv` where the order of the ``na_values`` makes an inconsistency when ``na_values`` is a list non-string values. (:issue:`59303`) - Bug in :meth:`read_csv` with ``engine="pyarrow"`` and ``dtype="Int64"`` losing precision (:issue:`56136`) +- Bug in :meth:`read_csv`` with dictionary-based dtype specifications not preserving leading zeros consistently across parser engines (:issue:`57666`) - Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`) - Bug in :meth:`read_html` where ``rowspan`` in header row causes incorrect conversion to ``DataFrame``. (:issue:`60210`) - Bug in :meth:`read_json` ignoring the given ``dtype`` when ``engine="pyarrow"`` (:issue:`59516`) From 22e41298dc0162d2571763659d3296cfd1aa1070 Mon Sep 17 00:00:00 2001 From: Daniel Caspi Date: Wed, 3 Sep 2025 15:23:15 -0500 Subject: [PATCH 12/20] Update test_preserve_leading_zeros.py --- pandas/tests/io/parser/test_preserve_leading_zeros.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/parser/test_preserve_leading_zeros.py b/pandas/tests/io/parser/test_preserve_leading_zeros.py index 7c2b6961df3b6..3cc53a0218d80 100644 --- a/pandas/tests/io/parser/test_preserve_leading_zeros.py +++ b/pandas/tests/io/parser/test_preserve_leading_zeros.py @@ -2,9 +2,10 @@ import pytest -import pandas._testing as tm from pandas.errors import ParserWarning +import pandas._testing as tm + def test_leading_zeros_preserved_with_dtype_str(all_parsers, request): # GH#57666: pyarrow engine strips leading zeros when dtype=str is passed From 74f01a7fde57076fa272ec0cba156a8a5ca5cf6e Mon Sep 17 00:00:00 2001 From: Daniel Caspi Date: Wed, 3 Sep 2025 16:20:04 -0500 Subject: [PATCH 13/20] use numpy dtype mapping --- doc/source/whatsnew/v3.0.0.rst | 2 +- pandas/io/parsers/arrow_parser_wrapper.py | 18 ++++--- .../io/parser/test_preserve_leading_zeros.py | 49 ++----------------- 3 files changed, 16 insertions(+), 53 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index c3812b4e0a151..ead60ad82f8cf 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -987,7 +987,7 @@ I/O - Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`) - Bug in :meth:`read_csv` where the order of the ``na_values`` makes an inconsistency when ``na_values`` is a list non-string values. (:issue:`59303`) - Bug in :meth:`read_csv` with ``engine="pyarrow"`` and ``dtype="Int64"`` losing precision (:issue:`56136`) -- Bug in :meth:`read_csv`` with dictionary-based dtype specifications not preserving leading zeros consistently across parser engines (:issue:`57666`) +- Bug in :meth:`read_csv` with dictionary-based dtype specifications not preserving leading zeros consistently across parser engines (:issue:`57666`) - Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`) - Bug in :meth:`read_html` where ``rowspan`` in header row causes incorrect conversion to ``DataFrame``. (:issue:`60210`) - Bug in :meth:`read_json` ignoring the given ``dtype`` when ``engine="pyarrow"`` (:issue:`59516`) diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 24c0c05cdbabd..4651c27d9ddbf 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -6,6 +6,7 @@ from pandas._libs import lib from pandas.compat._optional import import_optional_dependency from pandas.errors import ( + DtypeWarning, Pandas4Warning, ParserError, ParserWarning, @@ -15,7 +16,6 @@ ) from pandas.core.dtypes.common import ( - is_string_dtype, pandas_dtype, ) from pandas.core.dtypes.inference import is_integer @@ -146,21 +146,25 @@ def handle_warning(invalid_row) -> str: if isinstance(self.dtype, dict): column_types = {} for col, col_dtype in self.dtype.items(): - if is_string_dtype(col_dtype): - column_types[col] = pa.string() - else: + try: + numpy_dtype = pandas_dtype(col_dtype).type + pyarrow_dtype = pa.from_numpy_dtype(numpy_dtype) + column_types[col] = pyarrow_dtype + except (TypeError, ValueError, pa.ArrowNotImplementedError): warnings.warn( f"Column '{col}' has dtype '{col_dtype}', " "which may not be handled correctly by the pyarrow engine.", - ParserWarning, + DtypeWarning, stacklevel=find_stack_level(), ) + if column_types: self.convert_options["column_types"] = column_types else: warnings.warn( - "The pyarrow engine expects a dict mapping columns to types.", - ParserWarning, + f"Global dtype '{self.dtype}' not supported with pyarrow engine. " + "Use dtype dictionary instead.", + DtypeWarning, stacklevel=find_stack_level(), ) diff --git a/pandas/tests/io/parser/test_preserve_leading_zeros.py b/pandas/tests/io/parser/test_preserve_leading_zeros.py index 3cc53a0218d80..a1f460d7e47ee 100644 --- a/pandas/tests/io/parser/test_preserve_leading_zeros.py +++ b/pandas/tests/io/parser/test_preserve_leading_zeros.py @@ -2,7 +2,7 @@ import pytest -from pandas.errors import ParserWarning +from pandas.errors import DtypeWarning import pandas._testing as tm @@ -22,7 +22,7 @@ def test_leading_zeros_preserved_with_dtype_str(all_parsers, request): if engine_name == "pyarrow": with tm.assert_produces_warning( - ParserWarning, match="pyarrow engine expects a dict mapping" + DtypeWarning, match="not supported with pyarrow engine" ): result = parser.read_csv( StringIO(data), @@ -53,7 +53,7 @@ def test_leading_zeros_preserved_with_dtype_str(all_parsers, request): raise -def test_leading_zeros_preserved_with_dtype_dict_str_only(all_parsers): +def test_leading_zeros_preserved_with_dtype_dict(all_parsers): # GH#57666: pyarrow engine strips leading zeros when dtype=str is passed # GH#61618: further discussion on ensuring string dtype preservation across engines @@ -67,7 +67,7 @@ def test_leading_zeros_preserved_with_dtype_dict_str_only(all_parsers): result = parser.read_csv( StringIO(data), - dtype={"col2": str, "col4": str}, + dtype={"col2": str, "col3": int, "col4": str}, ) assert result.shape == (4, 4) @@ -82,44 +82,3 @@ def test_leading_zeros_preserved_with_dtype_dict_str_only(all_parsers): assert result.loc[1, "col3"] == 200 assert result.loc[2, "col3"] == 201 assert result.loc[3, "col3"] == 202 - - -def test_leading_zeros_preserved_with_heterogeneous_dtypes(all_parsers): - # GH#57666: pyarrow engine strips leading zeros when dtype=str is passed - # GH#61618: further discussion on ensuring string dtype preservation across engines - - parser = all_parsers - engine_name = getattr(parser, "engine", "unknown") - - data = """col1,col2,col3,col4 -AB,000388907,199,0150 -CD,101044572,200,0150 -EF,000023607,201,0205 -GH,100102040,202,0205""" - - if engine_name == "pyarrow": - with tm.assert_produces_warning( - ParserWarning, match="may not be handled correctly by the pyarrow engine" - ): - result = parser.read_csv( - StringIO(data), - dtype={"col2": str, "col3": int, "col4": str}, - ) - else: - result = parser.read_csv( - StringIO(data), - dtype={"col2": str, "col3": int, "col4": str}, - ) - - assert result.shape == (4, 4) - assert list(result.columns) == ["col1", "col2", "col3", "col4"] - - assert result.loc[0, "col2"] == "000388907", "lost zeros in col2 row 0" - assert result.loc[2, "col2"] == "000023607", "lost zeros in col2 row 2" - assert result.loc[0, "col4"] == "0150", "lost zeros in col4 row 0" - assert result.loc[2, "col4"] == "0205", "lost zeros in col4 row 2" - - assert result.loc[0, "col3"] == 199 - assert result.loc[1, "col3"] == 200 - assert result.loc[2, "col3"] == 201 - assert result.loc[3, "col3"] == 202 From 9d68f91e6366bc55c0cea6f267cbfd2953030752 Mon Sep 17 00:00:00 2001 From: Daniel Caspi Date: Wed, 3 Sep 2025 17:27:58 -0500 Subject: [PATCH 14/20] change warnings behavior --- pandas/io/parsers/arrow_parser_wrapper.py | 24 +++++++++---------- .../io/parser/test_preserve_leading_zeros.py | 21 ++++------------ 2 files changed, 15 insertions(+), 30 deletions(-) diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 4651c27d9ddbf..8b1970ef86ceb 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -6,7 +6,6 @@ from pandas._libs import lib from pandas.compat._optional import import_optional_dependency from pandas.errors import ( - DtypeWarning, Pandas4Warning, ParserError, ParserWarning, @@ -146,27 +145,26 @@ def handle_warning(invalid_row) -> str: if isinstance(self.dtype, dict): column_types = {} for col, col_dtype in self.dtype.items(): + # TODO: Category dtypes are not currently handled - may cause issues + # with categorical data preservation in pyarrow engine + if col_dtype == "category": + continue + try: numpy_dtype = pandas_dtype(col_dtype).type pyarrow_dtype = pa.from_numpy_dtype(numpy_dtype) column_types[col] = pyarrow_dtype except (TypeError, ValueError, pa.ArrowNotImplementedError): - warnings.warn( - f"Column '{col}' has dtype '{col_dtype}', " - "which may not be handled correctly by the pyarrow engine.", - DtypeWarning, - stacklevel=find_stack_level(), - ) + # TODO: Unsupported dtypes silently ignored - may cause unexpected + # behavior when pyarrow applies default inference instead of user's dtype + continue if column_types: self.convert_options["column_types"] = column_types else: - warnings.warn( - f"Global dtype '{self.dtype}' not supported with pyarrow engine. " - "Use dtype dictionary instead.", - DtypeWarning, - stacklevel=find_stack_level(), - ) + # TODO: Global dtypes not supported - may cause inconsistent behavior + # between engines, especially for leading zero preservation + pass self.read_options = { "autogenerate_column_names": self.header is None, diff --git a/pandas/tests/io/parser/test_preserve_leading_zeros.py b/pandas/tests/io/parser/test_preserve_leading_zeros.py index a1f460d7e47ee..757962d06817a 100644 --- a/pandas/tests/io/parser/test_preserve_leading_zeros.py +++ b/pandas/tests/io/parser/test_preserve_leading_zeros.py @@ -2,10 +2,6 @@ import pytest -from pandas.errors import DtypeWarning - -import pandas._testing as tm - def test_leading_zeros_preserved_with_dtype_str(all_parsers, request): # GH#57666: pyarrow engine strips leading zeros when dtype=str is passed @@ -20,19 +16,10 @@ def test_leading_zeros_preserved_with_dtype_str(all_parsers, request): EF,000023607,ghi,0205 GH,100102040,jkl,0205""" - if engine_name == "pyarrow": - with tm.assert_produces_warning( - DtypeWarning, match="not supported with pyarrow engine" - ): - result = parser.read_csv( - StringIO(data), - dtype=str, - ) - else: - result = parser.read_csv( - StringIO(data), - dtype=str, - ) + result = parser.read_csv( + StringIO(data), + dtype=str, + ) try: assert result.shape == (4, 4) From a324f4aceaf9709b3adfbabc401b3ac33555228c Mon Sep 17 00:00:00 2001 From: Daniel Caspi Date: Wed, 3 Sep 2025 17:32:53 -0500 Subject: [PATCH 15/20] Update arrow_parser_wrapper.py --- pandas/io/parsers/arrow_parser_wrapper.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 8b1970ef86ceb..4fdeec95b5a2a 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -156,7 +156,8 @@ def handle_warning(invalid_row) -> str: column_types[col] = pyarrow_dtype except (TypeError, ValueError, pa.ArrowNotImplementedError): # TODO: Unsupported dtypes silently ignored - may cause unexpected - # behavior when pyarrow applies default inference instead of user's dtype + # behavior when pyarrow applies default inference instead of + # user's dtype continue if column_types: From 8cda906fdbdedb9330e565d21110050721d6f8c1 Mon Sep 17 00:00:00 2001 From: Daniel Caspi Date: Wed, 3 Sep 2025 17:40:03 -0500 Subject: [PATCH 16/20] Update arrow_parser_wrapper.py --- pandas/io/parsers/arrow_parser_wrapper.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 4fdeec95b5a2a..0147203b4ceab 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -155,9 +155,9 @@ def handle_warning(invalid_row) -> str: pyarrow_dtype = pa.from_numpy_dtype(numpy_dtype) column_types[col] = pyarrow_dtype except (TypeError, ValueError, pa.ArrowNotImplementedError): - # TODO: Unsupported dtypes silently ignored - may cause unexpected - # behavior when pyarrow applies default inference instead of - # user's dtype + # TODO: Unsupported dtypes silently ignored - may cause + # unexpected behavior when pyarrow applies default inference + # instead of user's dtype continue if column_types: From df9e96d23cb327276b5e70f713173a9793bf26af Mon Sep 17 00:00:00 2001 From: Daniel Caspi Date: Wed, 3 Sep 2025 19:15:11 -0500 Subject: [PATCH 17/20] Update arrow_parser_wrapper.py --- pandas/io/parsers/arrow_parser_wrapper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 0147203b4ceab..26228607882d2 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -154,7 +154,7 @@ def handle_warning(invalid_row) -> str: numpy_dtype = pandas_dtype(col_dtype).type pyarrow_dtype = pa.from_numpy_dtype(numpy_dtype) column_types[col] = pyarrow_dtype - except (TypeError, ValueError, pa.ArrowNotImplementedError): + except (ValueError, pa.ArrowNotImplementedError): # TODO: Unsupported dtypes silently ignored - may cause # unexpected behavior when pyarrow applies default inference # instead of user's dtype From d2925bcf51d7820f798691d20fc1d971739b6ea1 Mon Sep 17 00:00:00 2001 From: Daniel Caspi Date: Thu, 4 Sep 2025 08:35:20 -0500 Subject: [PATCH 18/20] use native to_pyarrow_type --- pandas/io/parsers/arrow_parser_wrapper.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 26228607882d2..d361c0b8f6952 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -19,6 +19,8 @@ ) from pandas.core.dtypes.inference import is_integer +from pandas.core.arrays.arrow.array import to_pyarrow_type + from pandas.io._util import arrow_table_to_pandas from pandas.io.parsers.base_parser import ParserBase @@ -145,20 +147,18 @@ def handle_warning(invalid_row) -> str: if isinstance(self.dtype, dict): column_types = {} for col, col_dtype in self.dtype.items(): - # TODO: Category dtypes are not currently handled - may cause issues - # with categorical data preservation in pyarrow engine - if col_dtype == "category": - continue + source_dtype = pandas_dtype(col_dtype) try: - numpy_dtype = pandas_dtype(col_dtype).type - pyarrow_dtype = pa.from_numpy_dtype(numpy_dtype) - column_types[col] = pyarrow_dtype - except (ValueError, pa.ArrowNotImplementedError): + target_dtype = to_pyarrow_type(source_dtype) + if target_dtype: + column_types[col] = target_dtype + + except TypeError: # TODO: Unsupported dtypes silently ignored - may cause # unexpected behavior when pyarrow applies default inference # instead of user's dtype - continue + pass if column_types: self.convert_options["column_types"] = column_types From d6f3a5c648a2ec720e6f1e1021a487465b38b6a5 Mon Sep 17 00:00:00 2001 From: Daniel Caspi Date: Thu, 4 Sep 2025 08:50:55 -0500 Subject: [PATCH 19/20] remove unused dep --- pandas/io/parsers/arrow_parser_wrapper.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index d361c0b8f6952..fcf50ded43fad 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -62,8 +62,6 @@ def _get_pyarrow_options(self) -> None: """ Rename some arguments to pass to pyarrow """ - pa = import_optional_dependency("pyarrow") - mapping = { "usecols": "include_columns", "na_values": "null_values", From 5df2f20681a6a648d494f4fabbf1afbac864c871 Mon Sep 17 00:00:00 2001 From: Daniel Caspi Date: Thu, 4 Sep 2025 10:07:22 -0500 Subject: [PATCH 20/20] Update arrow_parser_wrapper.py --- pandas/io/parsers/arrow_parser_wrapper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index fcf50ded43fad..294cccea189ed 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -148,7 +148,7 @@ def handle_warning(invalid_row) -> str: source_dtype = pandas_dtype(col_dtype) try: - target_dtype = to_pyarrow_type(source_dtype) + target_dtype = to_pyarrow_type(source_dtype.type) if target_dtype: column_types[col] = target_dtype