Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -234,3 +234,33 @@ dtype: object

When converting from higher precision numerical dtypes (like float64) to
lower precision (like float32), data precision might be compromised.

### Nested Data Types Support

pandas-pyarrow also supports automatic detection and conversion of nested data types:

```python
import pandas as pd
from pandas_pyarrow import convert_to_pyarrow, convert_to_numpy
from pandas_pyarrow.pda_converter import PandasArrowConverter

# Create a DataFrame with list and dictionary columns
df = pd.DataFrame({
'list_col': [[1, 2, 3], [4, 5], [6, 7, 8, 9]],
'dict_col': [{'a': 1, 'b': 2}, {'c': 3}, {'d': 4, 'e': 5}]
})

# Convert to PyArrow-backed DataFrame
adf = convert_to_pyarrow(df)

# Access nested type information
converter = PandasArrowConverter()
nested_types = converter.get_nested_dtypes(adf)
print(nested_types)

# Convert back to pandas/numpy
rdf = convert_to_numpy(adf)
```

This will properly convert:
- List columns to `list[pyarrow]` type
- Dictionary columns to `struct[pyarrow]` type
5 changes: 5 additions & 0 deletions pandas_pyarrow/mappers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from .datetime_mapper import datetime_mapper, reverse_datetime_mapper
from .db_types import mapper_db_types
from .dtype_mapper import mapper_dict_dt, mapper_dict_object, reverse_mapper_dict
from .nested_mapper import nested_mapper_dict, reverse_nested_mapper_dict
from .numeric_mapper import numeric_mapper, reverse_numeric_mapper


Expand All @@ -15,6 +16,7 @@ def create_mapper() -> Dict[str, str]:
**mapper_dict_dt,
**mapper_dict_object,
**mapper_db_types,
**nested_mapper_dict,
)
return all_mapper_dicts

Expand All @@ -28,6 +30,7 @@ def reverse_create_mapper(
**reverse_numeric_mapper(["uint"], ["8", "16", "32", "64"]),
**reverse_datetime_mapper(adapter=adapter),
**reverse_mapper_dict,
**reverse_nested_mapper_dict,
)
return all_mapper_dicts

Expand All @@ -40,4 +43,6 @@ def reverse_create_mapper(
"mapper_db_types",
"datetime_mapper",
"numeric_mapper",
"nested_mapper_dict",
"reverse_nested_mapper_dict",
]
12 changes: 12 additions & 0 deletions pandas_pyarrow/mappers/nested_mapper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from typing import Dict

# Forward mapping for nested dtypes (lists / dicts).
# Detection happens structurally in the converter, so no raw-dtype-name keys
# are needed here; the dict exists to keep the mapper interface uniform.
nested_mapper_dict: Dict[str, str] = {}

# Reverse mapping: nested pyarrow dtype names -> the pandas/numpy dtype they
# decay to when converting back (python containers live in object columns).
reverse_nested_mapper_dict: Dict[str, str] = dict.fromkeys(
    ("list[pyarrow]", "struct[pyarrow]", "map[pyarrow]"),
    "object",
)
79 changes: 77 additions & 2 deletions pandas_pyarrow/pda_converter.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,14 @@
from typing import Dict, List, Optional
from typing import Dict, List, Optional, Tuple, Any
import inspect

from .mappers import create_mapper

import numpy as np
import pandas as pd
import pyarrow as pa

# Global dict to track nested type columns
_NESTED_TYPE_COLUMNS = {}


class PandasArrowConverter:
Expand All @@ -11,17 +17,22 @@
Will override default mapping
:param default_target_type: Optional string specifying the default data type to use if no mapping is found for a
specific data type. Default is "string[pyarrow]".
:param detect_nested: Whether to detect and convert nested data types (lists, dictionaries) in object columns.
Default is True.

"""

def __init__(
    self,
    custom_mapper: Optional[Dict[str, str]] = None,
    default_target_type: Optional[str] = "string[pyarrow]",
    detect_nested: bool = True,
) -> None:
    """Configure the converter.

    :param custom_mapper: extra dtype-name -> dtype-name entries; they take
        precedence over the built-in mapping.
    :param default_target_type: dtype used when no mapping entry matches.
    :param detect_nested: whether object columns are sniffed for lists/dicts.
    """
    # Fall back to an empty dict when no overrides were supplied.
    self.additional_mapper_dicts = custom_mapper or {}
    self.defaults_dtype = default_target_type
    self.detect_nested = detect_nested
    # Per-call registry of columns recognised as nested; reset on every call.
    self.nested_type_registry: Dict[str, str] = {}
    # Custom entries win over the defaults produced by create_mapper().
    self._mapper = {**create_mapper(), **self.additional_mapper_dicts}

def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Expand All @@ -32,15 +43,79 @@
a mapping function to get the corresponding target dtypes, and applies the
mapping to create a new DataFrame with updated dtypes.

If detect_nested is True, it will also attempt to detect columns containing
lists or dictionaries and convert them to the appropriate PyArrow types.

:param df: A Pandas DataFrame whose column dtypes will be transformed.
:type df: pd.DataFrame
:return: A new Pandas DataFrame with transformed column dtypes.
:rtype: pd.DataFrame
"""
# Clear the nested type registry
self.nested_type_registry = {}

dtype_names: List[str] = df.dtypes.astype(str).tolist()
target_dtype_names = self._map_dtype_names(dtype_names)
adf = df.astype(dict(zip(df.columns, target_dtype_names)))

# Get column to dtype mapping
col_to_dtype = dict(zip(df.columns, target_dtype_names))

# Convert the DataFrame
adf = df.astype({col: dtype for col, dtype in col_to_dtype.items()})

# If detect_nested is enabled, handle nested types
if self.detect_nested:
adf = self._handle_nested_types(df, adf)

# Store the nested types registry in the DataFrame as metadata
adf._nested_types_registry = self.nested_type_registry

return adf

def _handle_nested_types(self, orig_df: pd.DataFrame, adf: pd.DataFrame) -> pd.DataFrame:
"""
Handle nested data types by using PyArrow directly

:param orig_df: Original DataFrame
:param adf: DataFrame being processed
:return: DataFrame with nested types properly handled
"""
# Find object columns to check for nested types
object_cols = [col for col in orig_df.columns if str(orig_df[col].dtype) == 'object']

for col in object_cols:
# Skip empty columns
if orig_df[col].isna().all():
continue

Check warning on line 89 in pandas_pyarrow/pda_converter.py

View check run for this annotation

Codecov / codecov/patch

pandas_pyarrow/pda_converter.py#L89

Added line #L89 was not covered by tests

# Get first non-null value to check its type
sample = orig_df[col].dropna().iloc[0] if not orig_df[col].isna().all() else None

if sample is not None:
if isinstance(sample, list):
# Convert to PyArrow and back to capture the list structure
table = pa.Table.from_pandas(orig_df[[col]])
adf[col] = table.to_pandas()[col]
# Track as a list type column
self.nested_type_registry[col] = "list[pyarrow]"

elif isinstance(sample, dict):
# Convert to PyArrow and back to capture the dict structure
table = pa.Table.from_pandas(orig_df[[col]])
adf[col] = table.to_pandas()[col]
# Track as a struct type column
self.nested_type_registry[col] = "struct[pyarrow]"

return adf

def get_nested_dtypes(self, df: pd.DataFrame) -> Dict[str, str]:
"""
Get the nested dtypes for a DataFrame that was processed by this converter

:param df: DataFrame to get nested types for
:return: Dictionary mapping column names to nested type names
"""
return getattr(df, "_nested_types_registry", {})

def _target_dtype_name(self, dtype_name: str) -> str:
type_mapper = self._mapper
Expand Down
7 changes: 5 additions & 2 deletions pandas_pyarrow/reverse_converter.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
# reverse_converter.py

from typing import Dict, List, Optional

from .mappers import reverse_create_mapper

import numpy as np
import pandas as pd
import pyarrow as pa


class ReversePandasArrowConverter:
Expand Down Expand Up @@ -56,6 +55,10 @@

if "bool" in dtype_name:
return "bool"

# Handle nested types
if "list[pyarrow]" in dtype_name or "struct[pyarrow]" in dtype_name or "map[pyarrow]" in dtype_name:
return "object"

Check warning on line 61 in pandas_pyarrow/reverse_converter.py

View check run for this annotation

Codecov / codecov/patch

pandas_pyarrow/reverse_converter.py#L61

Added line #L61 was not covered by tests

return self._mapper.get(dtype_name, self._default_target_type)

Expand Down
16 changes: 16 additions & 0 deletions tests/unit/test_dtype_exists.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@
import pytest


# Define nested types to skip in tests
NESTED_TYPES = ["list[pyarrow]", "struct[pyarrow]", "map[pyarrow]"]


@pytest.mark.parametrize(
"str_types",
[
Expand All @@ -15,6 +19,10 @@
)
def test_str_dtypes(str_types):
for t in str_types:
# Skip nested types that pandas dtype cannot handle
if any(nested_type in t for nested_type in NESTED_TYPES):
continue

pd_dtype = pd.api.types.pandas_dtype(t)
if "pyarrow" not in t:
assert str(pd_dtype) in str_types
Expand All @@ -29,6 +37,10 @@ def test_str_dtypes(str_types):
)
def test_str_dtypes_numpy(str_types):
for t in str_types:
# Skip nested types that pandas dtype cannot handle
if any(nested_type in t for nested_type in NESTED_TYPES):
continue

pd_dtype = pd.api.types.pandas_dtype(t)
assert str(pd_dtype) in str_types

Expand All @@ -38,6 +50,10 @@ def test_str_dtypes_pyarrow():
mapper_from_pyarrow = reverse_create_mapper(adapter="tz=")
str_types = set(mapper_from_numpy.values()).union(set(mapper_from_pyarrow.keys()))
for t in str_types:
# Skip nested types that pandas dtype cannot handle
if any(nested_type in t for nested_type in NESTED_TYPES):
continue

pd_dtype = pd.api.types.pandas_dtype(t)
pd_dtype_repr = repr(pd_dtype)
assert pd_dtype_repr in str_types
50 changes: 50 additions & 0 deletions tests/unit/test_nested_types.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
from pandas_pyarrow import convert_to_numpy
from pandas_pyarrow.pda_converter import PandasArrowConverter

import pandas as pd
import numpy as np
import pytest
from parametrization import Parametrization


def create_df(column_values, data_type=None):
    """Return a one-column DataFrame named ``test_column``.

    ``data_type``, when truthy, is forwarded to pandas as the column dtype.
    """
    frame_kwargs = {"dtype": data_type} if data_type else {}
    return pd.DataFrame({"test_column": column_values}, **frame_kwargs)


@Parametrization.autodetect_parameters()
@Parametrization.case(
    name="test list type detection",
    df_data=create_df([[1, 2, 3], [4, 5], [6, 7, 8, 9]]),
    expected_dtype="list[pyarrow]",
    additional_mapper_dicts=None,
)
@Parametrization.case(
    name="test dict type detection",
    df_data=create_df([{'a': 1, 'b': 2}, {'c': 3}, {'d': 4, 'e': 5}]),
    expected_dtype="struct[pyarrow]",
    additional_mapper_dicts=None,
)
def test_nested_types(df_data, expected_dtype, additional_mapper_dicts):
    """End-to-end check of nested-type detection and round-trip conversion.

    Parameters are injected per case by the Parametrization decorators above:
    ``df_data`` is a one-column frame of nested values, ``expected_dtype`` the
    registry label the converter should record for it.
    """
    converter = PandasArrowConverter(custom_mapper=additional_mapper_dicts)
    adf = converter(df_data)

    # Check that converter properly registered the nested type
    assert converter.nested_type_registry['test_column'] == expected_dtype

    # Check that the nested_types_registry is stored in the DataFrame
    # (attached as an ad-hoc attribute by the converter's __call__)
    assert hasattr(adf, "_nested_types_registry")
    assert adf._nested_types_registry['test_column'] == expected_dtype

    # Use the getter method to verify the type
    nested_types = converter.get_nested_dtypes(adf)
    assert nested_types['test_column'] == expected_dtype

    # Convert back to numpy and verify it's still object type
    # (nested pyarrow types decay to object columns on the reverse path)
    rdf = convert_to_numpy(adf)
    assert rdf.dtypes.iloc[0] == np.dtype('O')

    # Verify data is present and has the right shape
    assert len(rdf) == len(df_data)
    assert rdf.shape == df_data.shape
Loading