Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -234,3 +234,33 @@ dtype: object

When converting from higher precision numerical dtypes (like float64) to
lower precision (like float32), data precision might be compromised.

### Nested Data Types Support

pandas-pyarrow also supports automatic detection and conversion of nested data types:

```python
import pandas as pd
from pandas_pyarrow import convert_to_pyarrow, convert_to_numpy
from pandas_pyarrow.pda_converter import PandasArrowConverter

# Create a DataFrame with list and dictionary columns
df = pd.DataFrame({
'list_col': [[1, 2, 3], [4, 5], [6, 7, 8, 9]],
'dict_col': [{'a': 1, 'b': 2}, {'c': 3}, {'d': 4, 'e': 5}]
})

# Convert to PyArrow-backed DataFrame
adf = convert_to_pyarrow(df)

# Access nested type information
converter = PandasArrowConverter()
nested_types = converter.get_nested_dtypes(adf)
print(nested_types)

# Convert back to pandas/numpy
rdf = convert_to_numpy(adf)
```

This will properly convert:
- List columns to `list[pyarrow]` type
- Dictionary columns to `struct[pyarrow]` type
5 changes: 5 additions & 0 deletions pandas_pyarrow/mappers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from .datetime_mapper import datetime_mapper, reverse_datetime_mapper
from .db_types import mapper_db_types
from .dtype_mapper import mapper_dict_dt, mapper_dict_object, reverse_mapper_dict
from .nested_mapper import nested_mapper_dict, reverse_nested_mapper_dict
from .numeric_mapper import numeric_mapper, reverse_numeric_mapper


Expand All @@ -15,6 +16,7 @@ def create_mapper() -> Dict[str, str]:
**mapper_dict_dt,
**mapper_dict_object,
**mapper_db_types,
**nested_mapper_dict,
)
return all_mapper_dicts

Expand All @@ -28,6 +30,7 @@ def reverse_create_mapper(
**reverse_numeric_mapper(["uint"], ["8", "16", "32", "64"]),
**reverse_datetime_mapper(adapter=adapter),
**reverse_mapper_dict,
**reverse_nested_mapper_dict,
)
return all_mapper_dicts

Expand All @@ -40,4 +43,6 @@ def reverse_create_mapper(
"mapper_db_types",
"datetime_mapper",
"numeric_mapper",
"nested_mapper_dict",
"reverse_nested_mapper_dict",
]
12 changes: 12 additions & 0 deletions pandas_pyarrow/mappers/nested_mapper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from typing import Dict

# Forward mapping for nested dtypes (lists / dicts).
# Detection happens structurally in the converter, so no raw-dtype-name keys
# are needed here; the dict exists to keep the mapper interface uniform.
nested_mapper_dict: Dict[str, str] = {}

# Reverse mapping: nested pyarrow dtype names -> the pandas/numpy dtype they
# decay to when converting back (python containers live in object columns).
reverse_nested_mapper_dict: Dict[str, str] = dict.fromkeys(
    ("list[pyarrow]", "struct[pyarrow]", "map[pyarrow]"),
    "object",
)
79 changes: 77 additions & 2 deletions pandas_pyarrow/pda_converter.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,14 @@
from typing import Dict, List, Optional
from typing import Dict, List, Optional, Tuple, Any
import inspect

from .mappers import create_mapper

import numpy as np
import pandas as pd
import pyarrow as pa

# Global dict to track nested type columns
_NESTED_TYPE_COLUMNS = {}


class PandasArrowConverter:
Expand All @@ -11,17 +17,22 @@
Will override default mapping
:param default_target_type: Optional string specifying the default data type to use if no mapping is found for a
specific data type. Default is "string[pyarrow]".
:param detect_nested: Whether to detect and convert nested data types (lists, dictionaries) in object columns.
Default is True.

"""

def __init__(
    self,
    custom_mapper: Optional[Dict[str, str]] = None,
    default_target_type: Optional[str] = "string[pyarrow]",
    detect_nested: bool = True,
) -> None:
    """Configure the converter.

    :param custom_mapper: extra dtype-name -> dtype-name entries; they take
        precedence over the built-in mapping.
    :param default_target_type: dtype used when no mapping entry matches.
    :param detect_nested: whether object columns are sniffed for lists/dicts.
    """
    # Fall back to an empty dict when no overrides were supplied.
    self.additional_mapper_dicts = custom_mapper or {}
    self.defaults_dtype = default_target_type
    self.detect_nested = detect_nested
    # Per-call registry of columns recognised as nested; reset on every call.
    self.nested_type_registry: Dict[str, str] = {}
    # Custom entries win over the defaults produced by create_mapper().
    self._mapper = {**create_mapper(), **self.additional_mapper_dicts}

def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Expand All @@ -32,15 +43,79 @@
a mapping function to get the corresponding target dtypes, and applies the
mapping to create a new DataFrame with updated dtypes.

If detect_nested is True, it will also attempt to detect columns containing
lists or dictionaries and convert them to the appropriate PyArrow types.

:param df: A Pandas DataFrame whose column dtypes will be transformed.
:type df: pd.DataFrame
:return: A new Pandas DataFrame with transformed column dtypes.
:rtype: pd.DataFrame
"""
# Clear the nested type registry
self.nested_type_registry = {}

dtype_names: List[str] = df.dtypes.astype(str).tolist()
target_dtype_names = self._map_dtype_names(dtype_names)
adf = df.astype(dict(zip(df.columns, target_dtype_names)))

# Get column to dtype mapping
col_to_dtype = dict(zip(df.columns, target_dtype_names))

# Convert the DataFrame
adf = df.astype({col: dtype for col, dtype in col_to_dtype.items()})

# If detect_nested is enabled, handle nested types
if self.detect_nested:
adf = self._handle_nested_types(df, adf)

# Store the nested types registry in the DataFrame as metadata
adf._nested_types_registry = self.nested_type_registry

return adf

def _handle_nested_types(self, orig_df: pd.DataFrame, adf: pd.DataFrame) -> pd.DataFrame:
"""
Handle nested data types by using PyArrow directly

:param orig_df: Original DataFrame
:param adf: DataFrame being processed
:return: DataFrame with nested types properly handled
"""
# Find object columns to check for nested types
object_cols = [col for col in orig_df.columns if str(orig_df[col].dtype) == 'object']

for col in object_cols:
# Skip empty columns
if orig_df[col].isna().all():
continue

Check warning on line 89 in pandas_pyarrow/pda_converter.py

View check run for this annotation

Codecov / codecov/patch

pandas_pyarrow/pda_converter.py#L89

Added line #L89 was not covered by tests

# Get first non-null value to check its type
sample = orig_df[col].dropna().iloc[0] if not orig_df[col].isna().all() else None

if sample is not None:
if isinstance(sample, list):
# Convert to PyArrow and back to capture the list structure
table = pa.Table.from_pandas(orig_df[[col]])
adf[col] = table.to_pandas()[col]
# Track as a list type column
self.nested_type_registry[col] = "list[pyarrow]"

elif isinstance(sample, dict):
# Convert to PyArrow and back to capture the dict structure
table = pa.Table.from_pandas(orig_df[[col]])
adf[col] = table.to_pandas()[col]
# Track as a struct type column
self.nested_type_registry[col] = "struct[pyarrow]"

return adf

def get_nested_dtypes(self, df: pd.DataFrame) -> Dict[str, str]:
"""
Get the nested dtypes for a DataFrame that was processed by this converter

:param df: DataFrame to get nested types for
:return: Dictionary mapping column names to nested type names
"""
return getattr(df, "_nested_types_registry", {})

def _target_dtype_name(self, dtype_name: str) -> str:
type_mapper = self._mapper
Expand Down
7 changes: 5 additions & 2 deletions pandas_pyarrow/reverse_converter.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
# reverse_converter.py

from typing import Dict, List, Optional

from .mappers import reverse_create_mapper

import numpy as np
import pandas as pd
import pyarrow as pa


class ReversePandasArrowConverter:
Expand Down Expand Up @@ -56,6 +55,10 @@

if "bool" in dtype_name:
return "bool"

# Handle nested types
if "list[pyarrow]" in dtype_name or "struct[pyarrow]" in dtype_name or "map[pyarrow]" in dtype_name:
return "object"

Check warning on line 61 in pandas_pyarrow/reverse_converter.py

View check run for this annotation

Codecov / codecov/patch

pandas_pyarrow/reverse_converter.py#L61

Added line #L61 was not covered by tests

return self._mapper.get(dtype_name, self._default_target_type)

Expand Down
16 changes: 16 additions & 0 deletions tests/unit/test_dtype_exists.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@
import pytest


# Define nested types to skip in tests
NESTED_TYPES = ["list[pyarrow]", "struct[pyarrow]", "map[pyarrow]"]


@pytest.mark.parametrize(
"str_types",
[
Expand All @@ -15,6 +19,10 @@
)
def test_str_dtypes(str_types):
for t in str_types:
# Skip nested types that pandas dtype cannot handle
if any(nested_type in t for nested_type in NESTED_TYPES):
continue

pd_dtype = pd.api.types.pandas_dtype(t)
if "pyarrow" not in t:
assert str(pd_dtype) in str_types
Expand All @@ -29,6 +37,10 @@ def test_str_dtypes(str_types):
)
def test_str_dtypes_numpy(str_types):
for t in str_types:
# Skip nested types that pandas dtype cannot handle
if any(nested_type in t for nested_type in NESTED_TYPES):
continue

pd_dtype = pd.api.types.pandas_dtype(t)
assert str(pd_dtype) in str_types

Expand All @@ -38,6 +50,10 @@ def test_str_dtypes_pyarrow():
mapper_from_pyarrow = reverse_create_mapper(adapter="tz=")
str_types = set(mapper_from_numpy.values()).union(set(mapper_from_pyarrow.keys()))
for t in str_types:
# Skip nested types that pandas dtype cannot handle
if any(nested_type in t for nested_type in NESTED_TYPES):
continue

pd_dtype = pd.api.types.pandas_dtype(t)
pd_dtype_repr = repr(pd_dtype)
assert pd_dtype_repr in str_types
50 changes: 50 additions & 0 deletions tests/unit/test_nested_types.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
from pandas_pyarrow import convert_to_numpy
from pandas_pyarrow.pda_converter import PandasArrowConverter

import pandas as pd
import numpy as np
import pytest
from parametrization import Parametrization


def create_df(column_values, data_type=None):
    """Return a one-column DataFrame named ``test_column``.

    ``data_type``, when truthy, is forwarded to pandas as the column dtype.
    """
    frame_kwargs = {"dtype": data_type} if data_type else {}
    return pd.DataFrame({"test_column": column_values}, **frame_kwargs)


@Parametrization.autodetect_parameters()
@Parametrization.case(
    name="test list type detection",
    df_data=create_df([[1, 2, 3], [4, 5], [6, 7, 8, 9]]),
    expected_dtype="list[pyarrow]",
    additional_mapper_dicts=None,
)
@Parametrization.case(
    name="test dict type detection",
    df_data=create_df([{'a': 1, 'b': 2}, {'c': 3}, {'d': 4, 'e': 5}]),
    expected_dtype="struct[pyarrow]",
    additional_mapper_dicts=None,
)
def test_nested_types(df_data, expected_dtype, additional_mapper_dicts):
    """End-to-end check of nested-type detection and round-trip conversion.

    Parameters are injected per case by the Parametrization decorators above:
    ``df_data`` is a one-column frame of nested values, ``expected_dtype`` the
    registry label the converter should record for it.
    """
    converter = PandasArrowConverter(custom_mapper=additional_mapper_dicts)
    adf = converter(df_data)

    # Check that converter properly registered the nested type
    assert converter.nested_type_registry['test_column'] == expected_dtype

    # Check that the nested_types_registry is stored in the DataFrame
    # (attached as an ad-hoc attribute by the converter's __call__)
    assert hasattr(adf, "_nested_types_registry")
    assert adf._nested_types_registry['test_column'] == expected_dtype

    # Use the getter method to verify the type
    nested_types = converter.get_nested_dtypes(adf)
    assert nested_types['test_column'] == expected_dtype

    # Convert back to numpy and verify it's still object type
    # (nested pyarrow types decay to object columns on the reverse path)
    rdf = convert_to_numpy(adf)
    assert rdf.dtypes.iloc[0] == np.dtype('O')

    # Verify data is present and has the right shape
    assert len(rdf) == len(df_data)
    assert rdf.shape == df_data.shape
Loading