From fcfa5231bc8e57844f5ad46ac8cc81eda06dc340 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Wed, 25 Oct 2023 07:12:50 -0500 Subject: [PATCH] Use polars.scan_parquet --- ci/environment.yml | 3 ++- tests/tpch/test_polars.py | 14 +++++++++----- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/ci/environment.yml b/ci/environment.yml index 18975fbf49..23a3beb494 100644 --- a/ci/environment.yml +++ b/ci/environment.yml @@ -41,5 +41,6 @@ dependencies: - pyspark ==3.4.1 - openjdk ==20.0.2 - python-duckdb ==0.9.1 - - polars >=0.19.3,<=0.19.10 # 0.19.3 on Windows, 0.19.10 on Linux and MacOS - altair + - pip: + - polars==0.19.12rc1 diff --git a/tests/tpch/test_polars.py b/tests/tpch/test_polars.py index 6925513381..36218cc2f8 100644 --- a/tests/tpch/test_polars.py +++ b/tests/tpch/test_polars.py @@ -1,28 +1,32 @@ from datetime import datetime import pytest -from pyarrow.dataset import dataset pl = pytest.importorskip("polars") def read_data(filename): - pyarrow_dataset = dataset(filename, format="parquet") - return pl.scan_pyarrow_dataset(pyarrow_dataset) + # This is still faster + # import pyarrow.dataset + # ds = pyarrow.dataset.dataset(filename, format="parquet") + # return pl.scan_pyarrow_dataset(ds) if filename.startswith("s3://"): import boto3 session = boto3.session.Session() credentials = session.get_credentials() - return pl.scan_parquet( - filename, + + df = pl.scan_parquet( + filename + "/*.parquet", storage_options={ "aws_access_key_id": credentials.access_key, "aws_secret_access_key": credentials.secret_key, "region": "us-east-2", + "session_token": credentials.token, }, ) + return df else: return pl.scan_parquet(filename + "/*")