Commit 25d19c5

Update Native S3 Samples to use explicit credentials + dataset (#73)
As we are reworking the internals of Hyper's experimental S3 feature, users now need to specify credentials explicitly in the S3_LOCATION constructor, even for public resources that do not require authentication. While working on this, we discovered that the NYC TLC taxi dataset used in one example is currently inaccessible, so that example now uses one of our internal Parquet datasets instead.
1 parent bf2484c commit 25d19c5
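In short, every S3_LOCATION now takes explicit (possibly empty) credentials. A minimal sketch of the new call shape, assuming a hypothetical public object s3://example-bucket/data.parquet (the diffs below show the real sample code):

from tableauhyperapi import escape_string_literal

# Hypothetical public object; any s3:// URL follows the same pattern
DATASET = escape_string_literal("s3://example-bucket/data.parquet")

# Before this change: S3_LOCATION({DATASET}, REGION => 'us-west-2')
# After this change: credentials must be passed explicitly, even empty ones for public buckets
s3_location = f"S3_LOCATION({DATASET}, ACCESS_KEY_ID => '', SECRET_ACCESS_KEY => '', REGION => 'us-west-2')"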

3 files changed: +37 -14 lines

Community-Supported/native-s3/join-parquet-and-csv-on-s3.py

Lines changed: 7 additions & 5 deletions
@@ -8,6 +8,8 @@
 # CSV file which contains the orders that were returned by the customers
 RETURNS_DATASET = escape_string_literal("s3://hyper-dev-us-west-2-bucket/tc22-demo/returns.csv")
 
+EMPTY_S3_CREDENTIALS = "ACCESS_KEY_ID => '', SECRET_ACCESS_KEY => ''"
+
 # We need to manually enable S3 connectivity as this is still an experimental feature
 with HyperProcess(telemetry=Telemetry.SEND_USAGE_DATA_TO_TABLEAU, parameters={"experimental_external_s3": "true"}) as hyper:
     # Create a connection to the Hyper process - we do not connect to a database
@@ -16,10 +18,10 @@
         # We use the `ARRAY` syntax in the CREATE TEMP EXTERNAL TABLE statement to specify multiple files to be unioned
         create_ext_orders_table = f"""
             CREATE TEMP EXTERNAL TABLE orders
-            FOR ARRAY[ S3_LOCATION({ORDERS_DATASET_2018}, REGION => 'us-west-2'),
-                       S3_LOCATION({ORDERS_DATASET_2019}, REGION => 'us-west-2'),
-                       S3_LOCATION({ORDERS_DATASET_2020}, REGION => 'us-west-2'),
-                       S3_LOCATION({ORDERS_DATASET_2021}, REGION => 'us-west-2')]
+            FOR ARRAY[ S3_LOCATION({ORDERS_DATASET_2018}, {EMPTY_S3_CREDENTIALS}, REGION => 'us-west-2'),
+                       S3_LOCATION({ORDERS_DATASET_2019}, {EMPTY_S3_CREDENTIALS}, REGION => 'us-west-2'),
+                       S3_LOCATION({ORDERS_DATASET_2020}, {EMPTY_S3_CREDENTIALS}, REGION => 'us-west-2'),
+                       S3_LOCATION({ORDERS_DATASET_2021}, {EMPTY_S3_CREDENTIALS}, REGION => 'us-west-2')]
             WITH (FORMAT => 'parquet')
         """
         connection.execute_command(create_ext_orders_table)
@@ -30,7 +32,7 @@
                 returned TEXT,
                 order_id TEXT
             )
-            FOR S3_LOCATION({RETURNS_DATASET}, REGION => 'us-west-2')
+            FOR S3_LOCATION({RETURNS_DATASET}, {EMPTY_S3_CREDENTIALS}, REGION => 'us-west-2')
            WITH (FORMAT => 'csv', HEADER => 'true', DELIMITER => ';')
         """
         connection.execute_command(create_ext_returns_table)
Lines changed: 25 additions & 7 deletions
@@ -1,26 +1,44 @@
 from tableauhyperapi import HyperProcess, Connection, Telemetry, CreateMode, SqlType, TableDefinition, TableName, Nullability, Inserter, escape_string_literal
 
 # Details and license of dataset: https://registry.opendata.aws/nyc-tlc-trip-records-pds/
+# NOTE: This dataset is currently not accessible - see the website above for details and to check whether it has become available again
 TAXI_DATASET = escape_string_literal("s3://nyc-tlc/trip%20data/yellow_tripdata_2021-06.parquet")  # The May release fixes a bug so that %20 doesn't need to be escaped manually
+TAXI_DATASET_TABLE_NAME = "taxi_rides"
+TAXI_DATASET_DBNAME = "taxi-rides-2021-06.hyper"
+TAXI_DATASET_REGION = "us-east-1"
+
+# Currently (last checked Aug 8, 2022) the NYC taxi dataset is not available on AWS OpenData; however, access may get restored in the future
+# Therefore, we provide an alternative using our own orders dataset in Parquet format
+ORDERS_DATASET = escape_string_literal("s3://hyper-dev-us-west-2-bucket/tc22-demo/orders_2018.parquet")
+ORDERS_DATASET_TABLE_NAME = "orders"
+ORDERS_DATASET_DBNAME = "orders-2018.hyper"
+ORDERS_DATASET_REGION = "us-west-2"
+
+# If AWS restores access to the NYC taxi dataset, the config below can be changed to reference TAXI_DATASET again
+CURRENT_DATASET = ORDERS_DATASET
+CURRENT_DATASET_TABLE_NAME = ORDERS_DATASET_TABLE_NAME
+CURRENT_DATASET_DBNAME = ORDERS_DATASET_DBNAME
+CURRENT_DATASET_REGION = ORDERS_DATASET_REGION
 
 # We need to manually enable S3 connectivity as this is still an experimental feature
 with HyperProcess(telemetry=Telemetry.SEND_USAGE_DATA_TO_TABLEAU, parameters={"experimental_external_s3": "true"}) as hyper:
     # Create a connection to the Hyper process and let it create a database file - if it exists, it's overwritten
-    with Connection(endpoint=hyper.endpoint, database="taxi-rides-2021-06.hyper", create_mode=CreateMode.CREATE_AND_REPLACE) as connection:
+    with Connection(endpoint=hyper.endpoint, database=CURRENT_DATASET_DBNAME, create_mode=CreateMode.CREATE_AND_REPLACE) as connection:
 
         # Use `TableName` so we do not have to worry about escaping in the SQL query we generate below
         # Note: This line does not create a table in Hyper, it just defines a name
-        taxi_rides = TableName("public", "taxi_rides")
+        table_name = TableName("public", CURRENT_DATASET_TABLE_NAME)
 
         # Ingest the data from the parquet file into a Hyper table
-        # Since the schema is stored inside the parquet file, we don't need to specify it explicitly here
-        cmd = f"CREATE TABLE {taxi_rides}" \
-              f" AS ( SELECT * FROM EXTERNAL(S3_LOCATION({TAXI_DATASET}), FORMAT => 'parquet'))"
+        # Since the schema is stored inside the parquet file, we don't need to specify it explicitly here
+        cmd = f"CREATE TABLE {table_name}" \
+              f" AS ( SELECT * FROM EXTERNAL(S3_LOCATION({CURRENT_DATASET}, ACCESS_KEY_ID => '', SECRET_ACCESS_KEY => '', REGION => '{CURRENT_DATASET_REGION}')," \
+              f" FORMAT => 'parquet'))"
 
         # We use `execute_command` to send the CREATE TABLE statement to Hyper
         # This may take some time depending on your network connectivity to AWS S3
         connection.execute_command(cmd)
 
         # Let's check how many rows we loaded
-        ride_count = connection.execute_scalar_query(f"SELECT COUNT(*) FROM {taxi_rides}")
-        print(f"Loaded {ride_count} taxi rides")
+        row_count = connection.execute_scalar_query(f"SELECT COUNT(*) FROM {table_name}")
+        print(f"Loaded {row_count} rows")
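If AWS restores access to the taxi dataset, the sample can be pointed back at it by swapping the CURRENT_* config block introduced above; a sketch of the switch:

CURRENT_DATASET = TAXI_DATASET
CURRENT_DATASET_TABLE_NAME = TAXI_DATASET_TABLE_NAME
CURRENT_DATASET_DBNAME = TAXI_DATASET_DBNAME
CURRENT_DATASET_REGION = TAXI_DATASET_REGION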

Community-Supported/native-s3/query-csv-on-s3.py

Lines changed: 5 additions & 2 deletions
@@ -8,15 +8,18 @@
 with Connection(endpoint=hyper.endpoint) as connection:
 
     # Use the CREATE TEMP EXTERNAL TABLE syntax - this allows us to use the CSV file like a normal table name in SQL queries
-    # We do not need to specify credentials as the S3 bucket is publicly accessible; this may be different when used with your own data
+    # We specify empty credentials as the bucket is publicly accessible; this may be different when used with your own data
     create_external_table = f"""
         CREATE TEMP EXTERNAL TABLE orders(
             order_date DATE,
             product_id TEXT,
             category TEXT,
             sales DOUBLE PRECISION
         )
-        FOR S3_LOCATION({ORDERS_DATASET_S3}, REGION => 'us-west-2')
+        FOR S3_LOCATION({ORDERS_DATASET_S3},
+                        ACCESS_KEY_ID => '',
+                        SECRET_ACCESS_KEY => '',
+                        REGION => 'us-west-2')
         WITH (FORMAT => 'csv', HEADER => true)
     """
     # Create the external table using `execute_command` which sends an instruction to the database - we don't expect a result value
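Once created, the temp external table can be queried like a regular table on the same connection. A minimal sketch; the aggregation query is illustrative and not part of the sample:

connection.execute_command(create_external_table)
# `execute_list_query` fetches the full result set; each row is a list of values
sales_per_category = connection.execute_list_query(
    "SELECT category, SUM(sales) FROM orders GROUP BY category")
for category, total_sales in sales_per_category:
    print(f"{category}: {total_sales}")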
