diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index c1b649e33..ec50ba403 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -1068,6 +1068,26 @@ def to_arrow_table(self) -> pa.Table: """ return self.df.to_arrow_table() + def to_arrow_array(self) -> pa.ChunkedArray: + """Execute the :py:class:`DataFrame` and convert it into an Arrow Array. + + Only valid when :py:class:`DataFrame` contains a single column. + + Returns: + Arrow Array. + """ + return self.df.to_arrow_array() + + def column(self, column_name: str) -> pa.ChunkedArray: + """Execute the :py:class:`DataFrame` and convert it into an Arrow Array. + + Only valid when :py:class:`DataFrame` contains a single column. + + Returns: + Arrow Array. + """ + return self.df.column(column_name) + def execute_stream(self) -> RecordBatchStream: """Executes this DataFrame and returns a stream over a single partition. diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index eb686dd19..34661805a 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -1694,6 +1694,23 @@ def test_to_arrow_table(df): assert set(pyarrow_table.column_names) == {"a", "b", "c"} +def test_to_arrow_array(df): + # Convert datafusion dataframe to pyarrow Array + pyarrow_array = df.select("a").to_arrow_array() + assert isinstance(pyarrow_array, pa.ChunkedArray) + assert pyarrow_array.to_numpy().shape == (3,) + + with pytest.raises(ValueError, match="single column"): + df.to_arrow_array() + + +def test_column(df): + # Grab column from datafusion dataframe as pyarrow Array + pyarrow_array = df.column("a") + assert isinstance(pyarrow_array, pa.ChunkedArray) + assert pyarrow_array.to_numpy().shape == (3,) + + def test_execute_stream(df): stream = df.execute_stream() assert all(batch is not None for batch in stream) diff --git a/src/dataframe.rs b/src/dataframe.rs index 5882acf76..87b6f1feb 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -859,6 +859,29 @@ impl PyDataFrame { Ok(()) } + /// Extract a single column directly as an Arrow Array + fn column(&self, py: Python<'_>, column_name: &str) -> PyResult { + let single_column_df = self.select(vec![PyExpr::column(column_name)])?; + let array = single_column_df.to_arrow_array(py)?; + Ok(array) + } + + /// Convert to Arrow Array + /// Collect the batches and pass to Arrow Array + fn to_arrow_array(&self, py: Python<'_>) -> PyResult { + let table = self.to_arrow_table(py)?; + let args = table.getattr(py, "column_names")?; + let column_names = args.extract::>(py)?; + if column_names.len() != 1 { + return Err(PyValueError::new_err( + "to_arrow_array only supports single column DataFrames", + )); + } + let column_name = column_names.get_item(0)?; + let array: PyObject = table.call_method1(py, "column", (column_name,))?; + Ok(array) + } + /// Convert to Arrow Table /// Collect the batches and pass to Arrow Table fn to_arrow_table(&self, py: Python<'_>) -> PyResult {