From 032ff40de37c1b8f913cb8053bcb023ab4810894 Mon Sep 17 00:00:00 2001 From: ntjohnson1 <24689722+ntjohnson1@users.noreply.github.com> Date: Fri, 24 Oct 2025 11:11:00 -0400 Subject: [PATCH 1/2] POC for easier column extraction --- python/datafusion/dataframe.py | 20 ++++++++++++++++++++ python/tests/test_dataframe.py | 17 +++++++++++++++++ src/dataframe.rs | 26 ++++++++++++++++++++++++++ 3 files changed, 63 insertions(+) diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index c1b649e33..ec50ba403 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -1068,6 +1068,26 @@ def to_arrow_table(self) -> pa.Table: """ return self.df.to_arrow_table() + def to_arrow_array(self) -> pa.ChunkedArray: + """Execute the :py:class:`DataFrame` and convert it into an Arrow Array. + + Only valid when :py:class:`DataFrame` contains a single column. + + Returns: + Arrow Array. + """ + return self.df.to_arrow_array() + + def column(self, column_name: str) -> pa.ChunkedArray: + """Execute the :py:class:`DataFrame` and convert it into an Arrow Array. + + Only valid when :py:class:`DataFrame` contains a single column. + + Returns: + Arrow Array. + """ + return self.df.column(column_name) + def execute_stream(self) -> RecordBatchStream: """Executes this DataFrame and returns a stream over a single partition. diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index eb686dd19..34661805a 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -1694,6 +1694,23 @@ def test_to_arrow_table(df): assert set(pyarrow_table.column_names) == {"a", "b", "c"} +def test_to_arrow_array(df): + # Convert datafusion dataframe to pyarrow Array + pyarrow_array = df.select("a").to_arrow_array() + assert isinstance(pyarrow_array, pa.ChunkedArray) + assert pyarrow_array.to_numpy().shape == (3,) + + with pytest.raises(ValueError, match="single column"): + df.to_arrow_array() + + +def test_column(df): + # Grab column from datafusion dataframe as pyarrow Array + pyarrow_array = df.column("a") + assert isinstance(pyarrow_array, pa.ChunkedArray) + assert pyarrow_array.to_numpy().shape == (3,) + + def test_execute_stream(df): stream = df.execute_stream() assert all(batch is not None for batch in stream) diff --git a/src/dataframe.rs b/src/dataframe.rs index 5882acf76..f42e4f1fb 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -859,6 +859,32 @@ impl PyDataFrame { Ok(()) } + /// Extract a single column directly as an Arrow Array + fn column(&self, py: Python<'_>, column_name: &str) -> PyResult { + let single_column_df = self.select(vec![PyExpr::column(column_name)])?; + let array = single_column_df.to_arrow_array(py)?; + Ok(array) + } + + /// Convert to Arrow Array + /// Collect the batches and pass to Arrow Array + fn to_arrow_array(&self, py: Python<'_>) -> PyResult { + println!("Converting to Arrow table"); + let table = self.to_arrow_table(py)?; + println!("Table"); + let args = table.getattr(py, "column_names")?; + let column_names = args.extract::>(py)?; + if column_names.len() != 1 { + return Err(PyValueError::new_err( + "to_arrow_array only supports single column DataFrames", + )); + } + print!("Args"); + let column_name = column_names.get_item(0)?; + let array: PyObject = table.call_method1(py, "column", (column_name,))?; + Ok(array) + } + /// Convert to Arrow Table /// Collect the batches and pass to Arrow Table fn to_arrow_table(&self, py: Python<'_>) -> PyResult { From d36f33da9303855cbaf9fe270c9e19d2ebac3d80 Mon Sep 17 00:00:00 2001 From: ntjohnson1 <24689722+ntjohnson1@users.noreply.github.com> Date: Fri, 24 Oct 2025 11:14:31 -0400 Subject: [PATCH 2/2] Remove debugging statements --- src/dataframe.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/dataframe.rs b/src/dataframe.rs index f42e4f1fb..87b6f1feb 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -869,9 +869,7 @@ impl PyDataFrame { /// Convert to Arrow Array /// Collect the batches and pass to Arrow Array fn to_arrow_array(&self, py: Python<'_>) -> PyResult { - println!("Converting to Arrow table"); let table = self.to_arrow_table(py)?; - println!("Table"); let args = table.getattr(py, "column_names")?; let column_names = args.extract::>(py)?; if column_names.len() != 1 { @@ -879,7 +877,6 @@ impl PyDataFrame { "to_arrow_array only supports single column DataFrames", )); } - print!("Args"); let column_name = column_names.get_item(0)?; let array: PyObject = table.call_method1(py, "column", (column_name,))?; Ok(array)