Commit 25d19c5

Update Native S3 Samples to use explicit credentials + dataset (#73)
As we are reworking the internals of Hyper's experimental S3 feature, users now need to specify credentials explicitly in the S3_LOCATION constructor, even for public resources that do not require authentication. While working on this, we discovered that the NYC TLC taxi dataset used in one example is currently inaccessible, so that example now uses one of our internal Parquet datasets instead.
1 parent bf2484c commit 25d19c5
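In short, every S3_LOCATION now takes explicit (possibly empty) credentials. A minimal sketch of the new call shape, assuming a hypothetical public object s3://example-bucket/data.parquet (the diffs below show the real sample code):

from tableauhyperapi import escape_string_literal

# Hypothetical public object; any s3:// URL follows the same pattern
DATASET = escape_string_literal("s3://example-bucket/data.parquet")

# Before this change: S3_LOCATION({DATASET}, REGION => 'us-west-2')
# After this change: credentials must be passed explicitly, even empty ones for public buckets
s3_location = f"S3_LOCATION({DATASET}, ACCESS_KEY_ID => '', SECRET_ACCESS_KEY => '', REGION => 'us-west-2')"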

3 files changed: +37 -14 lines

Community-Supported/native-s3/join-parquet-and-csv-on-s3.py

Lines changed: 7 additions & 5 deletions
@@ -8,6 +8,8 @@
 # CSV file which contains the orders that were returned by the customers
 RETURNS_DATASET = escape_string_literal("s3://hyper-dev-us-west-2-bucket/tc22-demo/returns.csv")
 
+EMPTY_S3_CREDENTIALS = "ACCESS_KEY_ID => '', SECRET_ACCESS_KEY => ''"
+
 # We need to manually enable S3 connectivity as this is still an experimental feature
 with HyperProcess(telemetry=Telemetry.SEND_USAGE_DATA_TO_TABLEAU, parameters={"experimental_external_s3": "true"}) as hyper:
     # Create a connection to the Hyper process - we do not connect to a database
@@ -16,10 +18,10 @@
         # We use the `ARRAY` syntax in the CREATE TEMP EXTERNAL TABLE statement to specify multiple files to be unioned
         create_ext_orders_table = f"""
             CREATE TEMP EXTERNAL TABLE orders
-            FOR ARRAY[ S3_LOCATION({ORDERS_DATASET_2018}, REGION => 'us-west-2'),
-                       S3_LOCATION({ORDERS_DATASET_2019}, REGION => 'us-west-2'),
-                       S3_LOCATION({ORDERS_DATASET_2020}, REGION => 'us-west-2'),
-                       S3_LOCATION({ORDERS_DATASET_2021}, REGION => 'us-west-2')]
+            FOR ARRAY[ S3_LOCATION({ORDERS_DATASET_2018}, {EMPTY_S3_CREDENTIALS}, REGION => 'us-west-2'),
+                       S3_LOCATION({ORDERS_DATASET_2019}, {EMPTY_S3_CREDENTIALS}, REGION => 'us-west-2'),
+                       S3_LOCATION({ORDERS_DATASET_2020}, {EMPTY_S3_CREDENTIALS}, REGION => 'us-west-2'),
+                       S3_LOCATION({ORDERS_DATASET_2021}, {EMPTY_S3_CREDENTIALS}, REGION => 'us-west-2')]
             WITH (FORMAT => 'parquet')
         """
         connection.execute_command(create_ext_orders_table)
@@ -30,7 +32,7 @@
                 returned TEXT,
                 order_id TEXT
             )
-            FOR S3_LOCATION({RETURNS_DATASET}, REGION => 'us-west-2')
+            FOR S3_LOCATION({RETURNS_DATASET}, {EMPTY_S3_CREDENTIALS}, REGION => 'us-west-2')
            WITH (FORMAT => 'csv', HEADER => 'true', DELIMITER => ';')
         """
         connection.execute_command(create_ext_returns_table)
Lines changed: 25 additions & 7 deletions
@@ -1,26 +1,44 @@
 from tableauhyperapi import HyperProcess, Connection, Telemetry, CreateMode, SqlType, TableDefinition, TableName, Nullability, Inserter, escape_string_literal
 
 # Details and license of dataset: https://registry.opendata.aws/nyc-tlc-trip-records-pds/
+# NOTE: This dataset is currently not accessible - see the website above for details and to check whether it has become available again
 TAXI_DATASET = escape_string_literal("s3://nyc-tlc/trip%20data/yellow_tripdata_2021-06.parquet")  # The May release fixes a bug so that %20 doesn't need to be escaped manually
+TAXI_DATASET_TABLE_NAME = "taxi_rides"
+TAXI_DATASET_DBNAME = "taxi-rides-2021-06.hyper"
+TAXI_DATASET_REGION = "us-east-1"
+
+# Currently (last checked Aug 8, 2022) the NYC taxi dataset is not available on AWS OpenData; however, access may get restored in the future
+# Therefore, we provide an alternative using our own orders dataset in Parquet format
+ORDERS_DATASET = escape_string_literal("s3://hyper-dev-us-west-2-bucket/tc22-demo/orders_2018.parquet")
+ORDERS_DATASET_TABLE_NAME = "orders"
+ORDERS_DATASET_DBNAME = "orders-2018.hyper"
+ORDERS_DATASET_REGION = "us-west-2"
+
+# If AWS restores access to the NYC taxi dataset, the config below can be changed to reference TAXI_DATASET again
+CURRENT_DATASET = ORDERS_DATASET
+CURRENT_DATASET_TABLE_NAME = ORDERS_DATASET_TABLE_NAME
+CURRENT_DATASET_DBNAME = ORDERS_DATASET_DBNAME
+CURRENT_DATASET_REGION = ORDERS_DATASET_REGION
 
 # We need to manually enable S3 connectivity as this is still an experimental feature
 with HyperProcess(telemetry=Telemetry.SEND_USAGE_DATA_TO_TABLEAU, parameters={"experimental_external_s3": "true"}) as hyper:
     # Create a connection to the Hyper process and let it create a database file - if it exists, it's overwritten
-    with Connection(endpoint=hyper.endpoint, database="taxi-rides-2021-06.hyper", create_mode=CreateMode.CREATE_AND_REPLACE) as connection:
+    with Connection(endpoint=hyper.endpoint, database=CURRENT_DATASET_DBNAME, create_mode=CreateMode.CREATE_AND_REPLACE) as connection:
 
         # Use `TableName` so we do not have to worry about escaping in the SQL query we generate below
         # Note: This line does not create a table in Hyper, it just defines a name
-        taxi_rides = TableName("public", "taxi_rides")
+        table_name = TableName("public", CURRENT_DATASET_TABLE_NAME)
 
         # Ingest the data from the parquet file into a Hyper table
-        # Since the schema is stored inside the parquet file, we don't need to specify it explicitly here
-        cmd = f"CREATE TABLE {taxi_rides}" \
-              f" AS ( SELECT * FROM EXTERNAL(S3_LOCATION({TAXI_DATASET}), FORMAT => 'parquet'))"
+        # Since the schema is stored inside the parquet file, we don't need to specify it explicitly here
+        cmd = f"CREATE TABLE {table_name}" \
+              f" AS ( SELECT * FROM EXTERNAL(S3_LOCATION({CURRENT_DATASET}, ACCESS_KEY_ID => '', SECRET_ACCESS_KEY => '', REGION => '{CURRENT_DATASET_REGION}')," \
+              f" FORMAT => 'parquet'))"
 
         # We use `execute_command` to send the CREATE TABLE statement to Hyper
         # This may take some time depending on your network connectivity to AWS S3
         connection.execute_command(cmd)
 
         # Let's check how many rows we loaded
-        ride_count = connection.execute_scalar_query(f"SELECT COUNT(*) FROM {taxi_rides}")
-        print(f"Loaded {ride_count} taxi rides")
+        row_count = connection.execute_scalar_query(f"SELECT COUNT(*) FROM {table_name}")
+        print(f"Loaded {row_count} rows")
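If AWS restores access to the taxi dataset, the sample can be pointed back at it by swapping the CURRENT_* config block introduced above; a sketch of the switch:

CURRENT_DATASET = TAXI_DATASET
CURRENT_DATASET_TABLE_NAME = TAXI_DATASET_TABLE_NAME
CURRENT_DATASET_DBNAME = TAXI_DATASET_DBNAME
CURRENT_DATASET_REGION = TAXI_DATASET_REGION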

Community-Supported/native-s3/query-csv-on-s3.py

Lines changed: 5 additions & 2 deletions
@@ -8,15 +8,18 @@
 with Connection(endpoint=hyper.endpoint) as connection:
 
     # Use the CREATE TEMP EXTERNAL TABLE syntax - this allows us to use the CSV file like a normal table name in SQL queries
-    # We do not need to specify credentials as the S3 bucket is publicly accessible; this may be different when used with your own data
+    # We specify empty credentials as the bucket is publicly accessible; this may be different when used with your own data
     create_external_table = f"""
         CREATE TEMP EXTERNAL TABLE orders(
             order_date DATE,
             product_id TEXT,
             category TEXT,
             sales DOUBLE PRECISION
         )
-        FOR S3_LOCATION({ORDERS_DATASET_S3}, REGION => 'us-west-2')
+        FOR S3_LOCATION({ORDERS_DATASET_S3},
+                        ACCESS_KEY_ID => '',
+                        SECRET_ACCESS_KEY => '',
+                        REGION => 'us-west-2')
         WITH (FORMAT => 'csv', HEADER => true)
     """
     # Create the external table using `execute_command` which sends an instruction to the database - we don't expect a result value
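Once created, the temp external table can be queried like a regular table on the same connection. A minimal sketch; the aggregation query is illustrative and not part of the sample:

connection.execute_command(create_external_table)
# `execute_list_query` fetches the full result set; each row is a list of values
sales_per_category = connection.execute_list_query(
    "SELECT category, SUM(sales) FROM orders GROUP BY category")
for category, total_sales in sales_per_category:
    print(f"{category}: {total_sales}")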
