Update python minimum version to 3.10 (#1296)

timsaucer · web-flow · commit a605b6190263 · 2025-10-29T15:16:26.000-04:00
* Set minimum Python version to 3.10 since 3.9 is end-of-life

* Apply Ruff fixes enabled by the 3.10 minimum (`X | Y` unions in `isinstance`, explicit `strict=` on `zip`)

* Update pyo3 to target the Python 3.10 stable ABI (`abi3-py310` instead of `abi3-py39`)

* Update dependencies for Python 3.14 (pyarrow>=22.0.0 and numpy>=2.3.2 on 3.14) and add 3.14 to the CI matrix and classifiers
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
@@ -33,11 +33,11 @@ jobs:
       fail-fast: false
       matrix:
         python-version:
-          - "3.9"
           - "3.10"
           - "3.11"
           - "3.12"
           - "3.13"
+          - "3.14"
         toolchain:
           - "stable"
 
diff --git a/Cargo.toml b/Cargo.toml
@@ -35,7 +35,7 @@ substrait = ["dep:datafusion-substrait"]
 
 [dependencies]
 tokio = { version = "1.47", features = ["macros", "rt", "rt-multi-thread", "sync"] }
-pyo3 = { version = "0.25", features = ["extension-module", "abi3", "abi3-py39"] }
+pyo3 = { version = "0.25", features = ["extension-module", "abi3", "abi3-py310"] }
 pyo3-async-runtimes = { version = "0.25", features = ["tokio-runtime"]}
 pyo3-log = "0.12.4"
 arrow = { version = "56", features = ["pyarrow"] }
diff --git a/pyproject.toml b/pyproject.toml
@@ -24,7 +24,7 @@ name = "datafusion"
 description = "Build and run queries against data"
 readme = "README.md"
 license = { file = "LICENSE.txt" }
-requires-python = ">=3.9"
+requires-python = ">=3.10"
 keywords = ["datafusion", "dataframe", "rust", "query-engine"]
 classifiers = [
     "Development Status :: 2 - Pre-Alpha",
@@ -35,15 +35,19 @@ classifiers = [
     "Operating System :: Microsoft :: Windows",
     "Operating System :: POSIX :: Linux",
     "Programming Language :: Python :: 3",
-    "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
     "Programming Language :: Python :: 3.12",
     "Programming Language :: Python :: 3.13",
+    "Programming Language :: Python :: 3.14",
     "Programming Language :: Python",
     "Programming Language :: Rust",
 ]
-dependencies = ["pyarrow>=11.0.0", "typing-extensions;python_version<'3.13'"]
+dependencies = [
+    "pyarrow>=11.0.0;python_version<'3.14'",
+    "pyarrow>=22.0.0;python_version>='3.14'",
+    "typing-extensions;python_version<'3.13'"
+]
 dynamic = ["version"]
 
 [project.urls]
@@ -147,8 +151,10 @@ ignore-words-list = [
 [dependency-groups]
 dev = [
     "maturin>=1.8.1",
-    "numpy>1.25.0",
-    "pre-commit>=4.0.0",
+    "numpy>1.25.0;python_version<'3.14'",
+    "numpy>=2.3.2;python_version>='3.14'",
+    "pre-commit>=4.3.0",
+    "pyyaml>=6.0.3",
     "pytest>=7.4.4",
     "pytest-asyncio>=0.23.3",
     "ruff>=0.9.1",
diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py
@@ -586,7 +586,7 @@ def with_columns(
             if isinstance(expr, str):
                 expressions.append(self.parse_sql_expr(expr).expr)
             elif isinstance(expr, Iterable) and not isinstance(
-                expr, (Expr, str, bytes, bytearray)
+                expr, Expr | str | bytes | bytearray
             ):
                 expressions.extend(
                     [
@@ -639,7 +639,7 @@ def aggregate(
         """
         group_by_list = (
             list(group_by)
-            if isinstance(group_by, Sequence) and not isinstance(group_by, (Expr, str))
+            if isinstance(group_by, Sequence) and not isinstance(group_by, Expr | str)
             else [group_by]
         )
         aggs_list = (
diff --git a/python/datafusion/expr.py b/python/datafusion/expr.py
@@ -271,7 +271,7 @@ def _iter(
     ) -> Iterable[expr_internal.Expr]:
         for expr in items:
             if isinstance(expr, Iterable) and not isinstance(
-                expr, (Expr, str, bytes, bytearray)
+                expr, Expr | str | bytes | bytearray
             ):
                 # Treat string-like objects as atomic to surface standard errors
                 yield from _iter(expr)
@@ -308,7 +308,7 @@ def expr_list_to_raw_expr_list(
     expr_list: Optional[list[Expr] | Expr],
 ) -> Optional[list[expr_internal.Expr]]:
     """Convert a sequence of expressions or column names to raw expressions."""
-    if isinstance(expr_list, (Expr, str)):
+    if isinstance(expr_list, Expr | str):
         expr_list = [expr_list]
     if expr_list is None:
         return None
@@ -326,7 +326,7 @@ def sort_list_to_raw_sort_list(
     sort_list: Optional[_typing.Union[Sequence[SortKey], SortKey]],
 ) -> Optional[list[expr_internal.SortExpr]]:
     """Helper function to return an optional sort list to raw variant."""
-    if isinstance(sort_list, (Expr, SortExpr, str)):
+    if isinstance(sort_list, Expr | SortExpr | str):
         sort_list = [sort_list]
     if sort_list is None:
         return None
diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py
@@ -567,7 +567,7 @@ def test_array_functions(stmt, py_expr):
 
     col = column("arr")
     query_result = df.select(stmt(col)).collect()[0].column(0)
-    for a, b in zip(query_result, py_expr(data)):
+    for a, b in zip(query_result, py_expr(data), strict=False):
         np.testing.assert_array_almost_equal(
             np.array(a.as_py(), dtype=float), np.array(b, dtype=float)
         )
@@ -582,7 +582,7 @@ def test_array_function_flatten():
     stmt = f.flatten(literal(data))
     py_expr = [py_flatten(data)]
     query_result = df.select(stmt).collect()[0].column(0)
-    for a, b in zip(query_result, py_expr):
+    for a, b in zip(query_result, py_expr, strict=False):
         np.testing.assert_array_almost_equal(
             np.array(a.as_py(), dtype=float), np.array(b, dtype=float)
         )
@@ -600,7 +600,7 @@ def test_array_function_cardinality():
 
     query_result = df.select(stmt).collect()[0].column(0)
 
-    for a, b in zip(query_result, py_expr):
+    for a, b in zip(query_result, py_expr, strict=False):
         np.testing.assert_array_equal(
             np.array([a.as_py()], dtype=int), np.array([b], dtype=int)
         )
@@ -631,7 +631,7 @@ def test_make_array_functions(make_func):
     ]
 
     query_result = df.select(stmt).collect()[0].column(0)
-    for a, b in zip(query_result, py_expr):
+    for a, b in zip(query_result, py_expr, strict=False):
         np.testing.assert_array_equal(
             np.array(a.as_py(), dtype=str), np.array(b, dtype=str)
         )
@@ -664,7 +664,7 @@ def test_array_function_obj_tests(stmt, py_expr):
     batch = pa.RecordBatch.from_arrays([np.array(data, dtype=object)], names=["arr"])
     df = ctx.create_dataframe([[batch]])
     query_result = np.array(df.select(stmt).collect()[0].column(0))
-    for a, b in zip(query_result, py_expr(data)):
+    for a, b in zip(query_result, py_expr(data), strict=False):
         assert a == b
 
 
diff --git a/python/tests/test_sql.py b/python/tests/test_sql.py
@@ -194,7 +194,7 @@ def test_register_parquet_partitioned(ctx, tmp_path, path_to_str, legacy_data_ty
     result = pa.Table.from_batches(result)
 
     rd = result.to_pydict()
-    assert dict(zip(rd["grp"], rd["cnt"])) == {"a": 3, "b": 1}
+    assert dict(zip(rd["grp"], rd["cnt"], strict=False)) == {"a": 3, "b": 1}
 
 
 @pytest.mark.parametrize("path_to_str", [True, False])
@@ -340,7 +340,10 @@ def test_execute(ctx, tmp_path):
         result_values.extend(pydict["cnt"])
 
     result_keys, result_values = (
-        list(t) for t in zip(*sorted(zip(result_keys, result_values)))
+        list(t)
+        for t in zip(
+            *sorted(zip(result_keys, result_values, strict=False)), strict=False
+        )
     )
 
     assert result_keys == [1, 2, 3, 11, 12]
@@ -467,7 +470,7 @@ def test_simple_select(ctx, tmp_path, arr):
     # In DF 43.0.0 we now default to having BinaryView and StringView
     # so the array that is saved to the parquet is slightly different
     # than the array read. Convert to values for comparison.
-    if isinstance(result, (pa.BinaryViewArray, pa.StringViewArray)):
+    if isinstance(result, pa.BinaryViewArray | pa.StringViewArray):
         arr = arr.tolist()
         result = result.tolist()
 
@@ -524,12 +527,12 @@ def test_register_listing_table(
     result = pa.Table.from_batches(result)
 
     rd = result.to_pydict()
-    assert dict(zip(rd["grp"], rd["count"])) == {"a": 5, "b": 2}
+    assert dict(zip(rd["grp"], rd["count"], strict=False)) == {"a": 5, "b": 2}
 
     result = ctx.sql(
         "SELECT grp, COUNT(*) AS count FROM my_table WHERE date='2020-10-05' GROUP BY grp"  # noqa: E501
     ).collect()
     result = pa.Table.from_batches(result)
 
     rd = result.to_pydict()
-    assert dict(zip(rd["grp"], rd["count"])) == {"a": 3, "b": 2}
+    assert dict(zip(rd["grp"], rd["count"], strict=False)) == {"a": 3, "b": 2}
diff --git a/uv.lock b/uv.lock