Skip to content

Commit 213f26b

Browse files
authored
Prefer pyarrow to read/write parquet; use arro3 as fallback (#598)
This is blocked on the next release of arro3
1 parent 2364d1b commit 213f26b

File tree

10 files changed

+138
-81
lines changed

10 files changed

+138
-81
lines changed

lonboard/_cli.py

Lines changed: 45 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from __future__ import annotations
2+
13
import json
24
import webbrowser
35
from pathlib import Path
@@ -22,7 +24,8 @@ def read_pyogrio(path: Path) -> Table:
2224
from pyogrio.raw import open_arrow
2325
except ImportError as e:
2426
raise ImportError(
25-
"pyogrio is a required dependency for the CLI. "
27+
"pyogrio is a required dependency for the CLI for reading data sources \n"
28+
"other than GeoParquet.\n"
2629
"Install with `pip install pyogrio`."
2730
) from e
2831

@@ -58,29 +61,55 @@ def read_pyogrio(path: Path) -> Table:
5861
return table.with_schema(new_schema)
5962

6063

61-
def read_geoparquet(path: Path) -> Table:
62-
"""Read GeoParquet file at path using pyarrow
64+
def read_parquet(path: Path) -> tuple[Table, dict]:
65+
"""Read Parquet file using either pyarrow or arro3.
66+
67+
arro3.io.read_parquet is not multi-threaded (as of arro3 0.2.1), so pyarrow can be
68+
up to 4x faster on an 8-core machine. Because of this, we prefer pyarrow if it's
69+
installed, and fall back to arro3 otherwise.
6370
6471
Args:
65-
path: Path to GeoParquet file
72+
path: path to Parquet file.
73+
74+
Raises:
75+
ValueError: if there's no GeoParquet metadata in the file
76+
77+
Returns:
78+
arro3 Table
6679
"""
6780
try:
6881
import pyarrow.parquet as pq
69-
except ImportError as e:
70-
raise ImportError(
71-
"pyarrow currently required for reading GeoParquet files.\n"
72-
"Run `pip install pyarrow`."
73-
) from e
7482

75-
file = pq.ParquetFile(path)
76-
geo_meta = file.metadata.metadata.get(b"geo")
77-
if not geo_meta:
78-
raise ValueError("Expected geo metadata in Parquet file")
83+
file = pq.ParquetFile(path)
84+
if b"geo" not in file.metadata.metadata:
85+
raise ValueError("Expected geo metadata in Parquet file")
86+
geo_meta = json.loads(file.metadata.metadata.get(b"geo"))
87+
88+
table = Table.from_arrow(file.read())
89+
90+
return table, geo_meta
7991

80-
pyarrow_table = file.read()
81-
table = Table.from_arrow(pyarrow_table)
92+
except ImportError:
93+
from arro3.io import read_parquet
8294

83-
geo_meta = json.loads(geo_meta)
95+
reader = read_parquet(path)
96+
97+
if "geo" not in reader.schema.metadata_str.keys():
98+
raise ValueError("Expected geo metadata in Parquet file")
99+
100+
table = reader.read_all()
101+
geo_meta = json.loads(table.schema.metadata_str["geo"])
102+
103+
return table, geo_meta
104+
105+
106+
def read_geoparquet(path: Path) -> Table:
107+
"""Read GeoParquet file at path using pyarrow or arro3.io
108+
109+
Args:
110+
path: Path to GeoParquet file
111+
"""
112+
table, geo_meta = read_parquet(path)
84113
geometry_column_name = geo_meta["primary_column"]
85114
geometry_column_index = [
86115
i for (i, name) in enumerate(table.schema.names) if name == geometry_column_name

lonboard/_geoarrow/_duckdb.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,14 @@
55
from typing import TYPE_CHECKING, List, Optional, Union
66

77
import numpy as np
8-
from arro3.compute import struct_field
98
from arro3.core import (
109
Array,
1110
ChunkedArray,
1211
Field,
1312
Table,
1413
fixed_size_list_array,
1514
list_array,
15+
struct_field,
1616
)
1717

1818
from lonboard._constants import EXTENSION_NAME

lonboard/_geoarrow/ops/bbox.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,7 @@
77
from typing import Tuple
88

99
import numpy as np
10-
from arro3.compute import list_flatten
11-
from arro3.core import Array, ChunkedArray, DataType, Field
10+
from arro3.core import Array, ChunkedArray, DataType, Field, list_flatten
1211

1312
from lonboard._constants import EXTENSION_NAME
1413

lonboard/_geoarrow/ops/centroid.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,7 @@
66
from typing import Optional
77

88
import numpy as np
9-
from arro3.compute import list_flatten
10-
from arro3.core import Array, ChunkedArray, DataType, Field
9+
from arro3.core import Array, ChunkedArray, DataType, Field, list_flatten
1110

1211
from lonboard._constants import EXTENSION_NAME
1312

lonboard/_geoarrow/ops/coord_layout.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,14 @@
33
from typing import Tuple
44

55
import numpy as np
6-
from arro3.compute import struct_field
76
from arro3.core import (
87
Array,
98
ChunkedArray,
109
DataType,
1110
Field,
1211
Table,
1312
fixed_size_list_array,
13+
struct_field,
1414
)
1515

1616
from lonboard._constants import EXTENSION_NAME

lonboard/_geoarrow/ops/reproject.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
from warnings import warn
99

1010
import numpy as np
11-
from arro3.compute import list_flatten, list_offsets
1211
from arro3.core import (
1312
Array,
1413
ChunkedArray,
@@ -17,6 +16,8 @@
1716
Table,
1817
fixed_size_list_array,
1918
list_array,
19+
list_flatten,
20+
list_offsets,
2021
)
2122
from pyproj import CRS, Transformer
2223

lonboard/_serialization.py

Lines changed: 49 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,7 @@
55
from typing import TYPE_CHECKING, List, Optional, Tuple, Union
66

77
import numpy as np
8-
from arro3.core import Array, ChunkedArray, Table
9-
from arro3.io import write_parquet
8+
from arro3.core import Array, ChunkedArray, RecordBatch, Table
109
from traitlets import TraitError
1110

1211
from lonboard.models import ViewState
@@ -27,28 +26,59 @@
2726
DEFAULT_MAX_NUM_CHUNKS = 32
2827

2928

29+
def write_parquet_batch(record_batch: RecordBatch) -> bytes:
30+
"""Write a RecordBatch to a Parquet file
31+
32+
We still use pyarrow.parquet.ParquetWriter if pyarrow is installed because pyarrow
33+
has better encoding defaults. So Parquet files written by pyarrow are smaller by
34+
default than files written by arro3.io.write_parquet.
35+
"""
36+
# Occasionally it's possible for there to be empty batches in the
37+
# pyarrow table. This will error when writing to parquet. We want to
38+
# give a more informative error.
39+
if record_batch.num_rows == 0:
40+
raise ValueError("Batch with 0 rows.")
41+
42+
try:
43+
import pyarrow as pa
44+
import pyarrow.parquet as pq
45+
46+
bio = BytesIO()
47+
with pq.ParquetWriter(
48+
bio,
49+
schema=pa.schema(record_batch.schema),
50+
compression=DEFAULT_PARQUET_COMPRESSION,
51+
compression_level=DEFAULT_PARQUET_COMPRESSION_LEVEL,
52+
) as writer:
53+
writer.write_batch(
54+
pa.record_batch(record_batch), row_group_size=record_batch.num_rows
55+
)
56+
57+
return bio.getvalue()
58+
59+
except ImportError:
60+
from arro3.io import write_parquet
61+
62+
compression_string = (
63+
f"{DEFAULT_PARQUET_COMPRESSION}({DEFAULT_PARQUET_COMPRESSION_LEVEL})"
64+
)
65+
bio = BytesIO()
66+
write_parquet(
67+
record_batch,
68+
bio,
69+
compression=compression_string,
70+
max_row_group_size=record_batch.num_rows,
71+
)
72+
73+
return bio.getvalue()
74+
75+
3076
def serialize_table_to_parquet(table: Table, *, max_chunksize: int) -> List[bytes]:
3177
buffers: List[bytes] = []
3278
assert max_chunksize > 0
3379

34-
compression_string = (
35-
f"{DEFAULT_PARQUET_COMPRESSION}({DEFAULT_PARQUET_COMPRESSION_LEVEL})"
36-
)
3780
for record_batch in table.rechunk(max_chunksize=max_chunksize).to_batches():
38-
with BytesIO() as bio:
39-
# Occasionally it's possible for there to be empty batches in the
40-
# pyarrow table. This will error when writing to parquet. We want to
41-
# give a more informative error.
42-
if record_batch.num_rows == 0:
43-
raise ValueError("Batch with 0 rows.")
44-
45-
write_parquet(
46-
table,
47-
bio,
48-
compression=compression_string,
49-
max_row_group_size=record_batch.num_rows,
50-
)
51-
buffers.append(bio.getvalue())
81+
buffers.append(write_parquet_batch(record_batch))
5282

5383
return buffers
5484

lonboard/_viz.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,7 @@
1818
)
1919

2020
import numpy as np
21-
from arro3.compute import struct_field
22-
from arro3.core import Array, ChunkedArray, Schema, Table
21+
from arro3.core import Array, ChunkedArray, Schema, Table, struct_field
2322

2423
from lonboard._compat import check_pandas_version
2524
from lonboard._constants import EXTENSION_NAME

poetry.lock

Lines changed: 34 additions & 34 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,9 @@ classifiers = [
3232
[tool.poetry.dependencies]
3333
python = "^3.8"
3434
anywidget = "^0.9.0"
35-
arro3-core = "^0.2.1"
36-
arro3-io = "^0.2.1"
37-
arro3-compute = "^0.2.1"
35+
arro3-core = "^0.3.0-beta.1"
36+
arro3-io = "^0.3.0-beta.1"
37+
arro3-compute = "^0.3.0-beta.1"
3838
ipywidgets = ">=7.6.0"
3939
numpy = ">=1.14"
4040
# The same version pin as geopandas

0 commit comments

Comments (0)