-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcount_parquet_rows.py
More file actions
executable file
·67 lines (54 loc) · 1.78 KB
/
count_parquet_rows.py
File metadata and controls
executable file
·67 lines (54 loc) · 1.78 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
#!/usr/bin/env python3
# /// script
# requires-python = ">=3.12"
# category = "data"
# dependencies = [
# "typer>=0.15.0",
# "rich>=13.0.0",
# "pyarrow",
# ]
# ///
"""
Count rows in a parquet dataset (local or S3) using metadata headers.
"""
import os
from typing import Annotated
import pyarrow.dataset as ds
import typer
from rich import print
def main(
    dataset_path: Annotated[str, typer.Argument(help="Local file path or S3 URI.")],
) -> None:
    """
    Count the number of rows in a parquet file/dataset without reading data into memory.

    Works by reading just the metadata headers. Supports:
    - Single parquet files
    - Directories of parquet shards
    - Hive-style partitioned datasets
    - Local paths and S3 URIs

    Arguments:
        DATASET_PATH: Local file path or S3 URI to the parquet dataset.

    Examples:
        # Local file
        uv run https://tools.ricardodecal.com/python/count_parquet_rows.py ./data.parquet

        # Directory of shards
        uv run https://tools.ricardodecal.com/python/count_parquet_rows.py ./data_dir/

        # S3 URI
        uv run https://tools.ricardodecal.com/python/count_parquet_rows.py s3://my-bucket/data.parquet
    """
    # NOTE: the docstring above doubles as the CLI help text when this function
    # is handed to typer.run(), so its wording is kept user-facing on purpose.

    # Function-scope imports keep this block self-contained; both names are
    # only needed on the error path.
    import sys

    from rich.markup import escape

    try:
        # Expand user path (~) if it's a local path; S3 URIs are passed through
        # to pyarrow untouched.
        if not dataset_path.startswith("s3://"):
            dataset_path = os.path.expanduser(dataset_path)

        dataset = ds.dataset(dataset_path, format="parquet")

        # Sum the per-row-group counts of every fragment in the dataset.
        row_count = sum(
            row_group.num_rows
            for fragment in dataset.get_fragments()
            for row_group in fragment.row_groups
        )
        print(row_count)
    except Exception as e:
        # Top-level CLI boundary: report any failure and exit non-zero.
        # - escape() stops bracketed text in the exception message (e.g. paths
        #   containing "[/...]") from being parsed as Rich markup, which could
        #   otherwise raise MarkupError here or silently drop part of the text.
        # - The message goes to stderr so stdout only ever carries the count,
        #   keeping the tool safe to use in pipelines and `$(...)` captures.
        print(
            f"[bold red]Error:[/bold red] Failed to count rows: {escape(str(e))}",
            file=sys.stderr,
        )
        raise typer.Exit(code=1) from e
# Script entry point: only runs when the file is executed directly (not on
# import). typer.run() is given `main` and drives it as the command-line
# interface.
if __name__ == "__main__":
    typer.run(main)