|
1 | 1 | from tableauhyperapi import HyperProcess, Connection, Telemetry, CreateMode, SqlType, TableDefinition, TableName, Nullability, Inserter, escape_string_literal |
2 | 2 |
|
3 | 3 | # Details and license of dataset: https://registry.opendata.aws/nyc-tlc-trip-records-pds/ |
| 4 | +# NOTE: This dataset is currently not accessible - see the website above for more details and to check whether it has become available again |
4 | 5 | TAXI_DATASET = escape_string_literal("s3://nyc-tlc/trip%20data/yellow_tripdata_2021-06.parquet") # May release fixes a bug so that %20 doesn't need to be escaped manually |
| 6 | +TAXI_DATASET_TABLE_NAME = "taxi_rides" |
| 7 | +TAXI_DATASET_DBNAME = "taxi-rides-2021-06.hyper" |
| 8 | +TAXI_DATASET_REGION = "us-east-1" |
| 9 | + |
| 10 | +# Currently (last checked Aug 8, 2022) the NYC taxi dataset is not available on AWS OpenData; however, access may be restored in the future |
| 11 | +# Therefore, we're providing an alternative using our own orders dataset in parquet format |
| 12 | +ORDERS_DATASET = escape_string_literal("s3://hyper-dev-us-west-2-bucket/tc22-demo/orders_2018.parquet") |
| 13 | +ORDERS_DATASET_TABLE_NAME = "orders" |
| 14 | +ORDERS_DATASET_DBNAME = "orders-2018.hyper" |
| 15 | +ORDERS_DATASET_REGION = "us-west-2" |
| 16 | + |
| 17 | +# Once AWS has restored access to the NYC taxi dataset, the config below can be changed to reference the TAXI_DATASET* values instead of the ORDERS_DATASET* values |
| 18 | +CURRENT_DATASET = ORDERS_DATASET |
| 19 | +CURRENT_DATASET_TABLE_NAME = ORDERS_DATASET_TABLE_NAME |
| 20 | +CURRENT_DATASET_DBNAME = ORDERS_DATASET_DBNAME |
| 21 | +CURRENT_DATASET_REGION = ORDERS_DATASET_REGION |
5 | 22 |
|
6 | 23 | # We need to manually enable S3 connectivity as this is still an experimental feature |
7 | 24 | with HyperProcess(telemetry=Telemetry.SEND_USAGE_DATA_TO_TABLEAU, parameters={"experimental_external_s3": "true"}) as hyper: |
8 | 25 | # Create a connection to the Hyper process and let it create a database file - if it exists, it's overwritten |
9 | | - with Connection(endpoint=hyper.endpoint, database="taxi-rides-2021-06.hyper", create_mode=CreateMode.CREATE_AND_REPLACE) as connection: |
| 26 | + with Connection(endpoint=hyper.endpoint, database=CURRENT_DATASET_DBNAME, create_mode=CreateMode.CREATE_AND_REPLACE) as connection: |
10 | 27 |
|
11 | 28 | # Use `TableName` so we do not have to worry about escaping in the SQL query we generate below |
12 | 29 | # Note: This line does not create a table in Hyper, it just defines a name |
13 | | - taxi_rides = TableName("public", "taxi_rides") |
| 30 | + table_name = TableName("public", CURRENT_DATASET_TABLE_NAME) |
14 | 31 |
|
15 | 32 | # Ingest the data from the parquet file into a Hyper Table |
16 | | - # Since the schema is stored inside the parquet file, we don't need to specify it explicitly here |
17 | | - cmd = f"CREATE TABLE {taxi_rides}" \ |
18 | | - f" AS ( SELECT * FROM EXTERNAL(S3_LOCATION({TAXI_DATASET}), FORMAT => 'parquet'))" |
| 33 | + # Since the schema is stored inside the parquet file, we don't need to specify it explicitly here |
| 34 | + cmd = f"CREATE TABLE {table_name}" \ |
| 35 | + f" AS ( SELECT * FROM EXTERNAL(S3_LOCATION({CURRENT_DATASET}, ACCESS_KEY_ID => '', SECRET_ACCESS_KEY => '', REGION => '{CURRENT_DATASET_REGION}')," \ |
| 36 | + f" FORMAT => 'parquet'))" |
19 | 37 |
|
20 | 38 | # We use `execute_command` to send the CREATE TABLE statement to Hyper |
21 | 39 | # This may take some time depending on your network connectivity to AWS S3 |
22 | 40 | connection.execute_command(cmd) |
23 | 41 |
|
24 | 42 | # Let's check how many rows we loaded |
25 | | - ride_count = connection.execute_scalar_query(f"SELECT COUNT(*) FROM {taxi_rides}") |
26 | | - print (f"Loaded {ride_count} taxi rides") |
| 43 | + row_count = connection.execute_scalar_query(f"SELECT COUNT(*) FROM {table_name}") |
| 44 | + print (f"Loaded {row_count} rows") |
0 commit comments