This repository was archived by the owner on Mar 13, 2020. It is now read-only.

Commit 6abd0d5

dames committed
Incremental/Full refresh from CSV appears to be working.
1 parent 1c84096 commit 6abd0d5

File tree

7 files changed: +100 −31 lines
Lines changed: 4 additions & 4 deletions

@@ -2,15 +2,15 @@

     "source_table": {
         "name": "ColumnTest",
-        "schema": "dbo"
+        "schema": "dbo",
+        "primary_key": "id"
     },
-    "target_schema": "load",
+    "target_schema": "rdl_integration_tests",
     "stage_table": "stage_source_data",
     "load_table": "load_source_data",

     "batch": {
-        "size": 100000,
-        "source_unique_column": "id"
+        "size": 100000
     },
     "columns": [
         {
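To see what this reshaping means for consumers of the configuration, here is a minimal standalone sketch (not repo code) of reading the new layout: the batch key column now lives at source_table.primary_key rather than batch.source_unique_column.

import json

# Sketch only: values taken from the configuration diff above.
pipeline_configuration = json.loads("""
{
    "source_table": {"name": "ColumnTest", "schema": "dbo", "primary_key": "id"},
    "target_schema": "rdl_integration_tests",
    "stage_table": "stage_source_data",
    "load_table": "load_source_data",
    "batch": {"size": 100000}
}
""")

batch_key_column = pipeline_configuration['source_table']['primary_key']  # "id"
batch_size = pipeline_configuration['batch']['size']                      # 100000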

full_refresh_from_csv.cmd

Lines changed: 5 additions & 0 deletions

@@ -0,0 +1,5 @@
+py relational_data_loader_project\__main__.py csv://./test_data/incremental_refresh postgresql+psycopg2://postgres:xxxx@localhost/dest_dw c:\_dev\relational-data-loader\configuration\ --log-level DEBUG
+
+
+
+
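A full-refresh counterpart is not part of this commit, but given the --full-refresh flag introduced below, a hypothetical variant of the same command would look like:

rem Hypothetical full-refresh variant (not in this commit), same source and destination:
py relational_data_loader_project\__main__.py csv://./test_data/incremental_refresh postgresql+psycopg2://postgres:xxxx@localhost/dest_dw c:\_dev\relational-data-loader\configuration\ --full-refresh --log-level DEBUG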

relational_data_loader_project/BatchDataLoader.py

Lines changed: 1 addition & 1 deletion

@@ -35,7 +35,7 @@ def load_batch(self, previous_batch_key):
         self.write_data_frame_to_table(data_frame)
         batch_tracker.load_completed_successfully()

-        last_key_returned = data_frame.iloc[-1][self.batch_configuration['source_unique_column']]
+        last_key_returned = data_frame.iloc[-1][self.source_table_configuration['primary_key']]

         self.logger.info("Batch key {0} Completed. {1}".format(last_key_returned, batch_tracker.get_statistics()))
         return last_key_returned
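A minimal sketch (made-up frame, not repo code) of what the changed line does: iloc[-1] takes the last row of the batch's DataFrame, and the column named by the source table's primary_key supplies the key from which the next batch resumes.

import pandas as pd

# Assumed stand-ins for the loader's configuration and a fetched batch.
source_table_configuration = {'primary_key': 'id'}
data_frame = pd.DataFrame({'id': [5, 6, 7],
                           'StringColumn1': ['a', 'b', 'c']})

# Mirrors the committed line: the last row's key value keys the next batch.
last_key_returned = data_frame.iloc[-1][source_table_configuration['primary_key']]
print(last_key_returned)  # 7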

relational_data_loader_project/DataLoadManager.py

Lines changed: 26 additions & 15 deletions

@@ -12,30 +12,38 @@ def __init__(self, configuration_path, data_source, logger=None):
         self.configuration_path = configuration_path
         self.data_source = data_source

-    def start_imports(self, target_engine, full_load):
+    def start_imports(self, target_engine, full_refresh):
         for file in os.listdir(self.configuration_path):
-            self.start_single_import(target_engine, file, full_load)
+            self.start_single_import(target_engine, file, full_refresh)

-    def start_single_import(self, target_engine, configuration_name, full_load):
+    def start_single_import(self, target_engine, configuration_name, requested_full_refresh):

         with open("{0}{1}".format(self.configuration_path, configuration_name)) as json_data:
             pipeline_configuration = json.load(json_data)

-        data_load_tracker = DataLoadTracker(configuration_name, json_data, full_load)
-
-        self.logger.debug("Execute Starting")
+        self.logger.info("Execute Starting for: {0} requested_full_refresh: {1}".format(configuration_name, requested_full_refresh))

         destination_table_manager = DestinationTableManager(target_engine)

+        full_refresh = requested_full_refresh
+        if not requested_full_refresh and not destination_table_manager.table_exists(pipeline_configuration['target_schema'],
+                                                                                     pipeline_configuration['stage_table']):
+            self.logger.warning("The load table {0}.{1} does not exist. Swapping to full-refresh mode".format(pipeline_configuration['target_schema'],
+                                                                                                              pipeline_configuration['stage_table']))
+            full_refresh = True
+
+        data_load_tracker = DataLoadTracker(configuration_name, json_data, full_refresh)
+
         columns = self.data_source.get_valid_columns(pipeline_configuration['source_table'],
                                                      pipeline_configuration['columns'])

         destination_table_manager.create_schema(pipeline_configuration['target_schema'])
-        if full_load:
-            self.logger.info("Full-load is set. Recreating the staging table.")
-            destination_table_manager.create_table(pipeline_configuration['target_schema'],
-                                                   pipeline_configuration['stage_table'],
-                                                   columns, drop_first=True)
+
+        self.logger.info("Recreating the staging table {0}.{1}".format(pipeline_configuration['target_schema'], pipeline_configuration['stage_table']))
+        destination_table_manager.create_table(pipeline_configuration['target_schema'],
+                                               pipeline_configuration['stage_table'],
+                                               columns, drop_first=True)
+

         # Import the data.
         batch_data_loader = BatchDataLoader(self.data_source,

@@ -53,15 +61,18 @@ def start_single_import(self, target_engine, configuration_name, full_load):

         self.logger.info("ImportBatch Completed")

-        if full_load:
-            #return
+        if full_refresh:
             # Rename the stage table to the load table.
             self.logger.info("Full-load is set. Renaming the stage table to the load table.")
             destination_table_manager.rename_table(pipeline_configuration['target_schema'],
                                                    pipeline_configuration['stage_table'],
                                                    pipeline_configuration['load_table'])
-        #else:
-        #    upsert_data_from_stage_to_load_tables(pipeline_configuration['stage_source_data'], pipeline_configuration['load_source_data'])
+        else:
+            self.logger.info("Incremental-load is set. Upserting from the stage table to the load table.")
+            destination_table_manager.upsert_table(pipeline_configuration['target_schema'],
+                                                   pipeline_configuration['stage_table'],
+                                                   pipeline_configuration['load_table'],
+                                                   pipeline_configuration['columns'])

         data_load_tracker.completed_successfully()
         self.logger.info(data_load_tracker.get_statistics())
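The mode-resolution rule above can be summarised in a standalone sketch (function name invented; table_exists stands in for DestinationTableManager.table_exists): an incremental request is silently promoted to a full refresh whenever the target table has never been created.

# Sketch of the decision logic in start_single_import (names invented).
def resolve_refresh_mode(requested_full_refresh, table_exists):
    if requested_full_refresh:
        return True   # caller asked for a full refresh
    if not table_exists:
        return True   # nothing to upsert into yet: promote to full refresh
    return False      # table present: perform an incremental upsert

assert resolve_refresh_mode(True, table_exists=True) is True
assert resolve_refresh_mode(False, table_exists=False) is True   # promoted
assert resolve_refresh_mode(False, table_exists=True) is False   # incremental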

relational_data_loader_project/DestinationTableManager.py

Lines changed: 42 additions & 8 deletions

@@ -4,16 +4,21 @@
 import importlib
 from sqlalchemy.sql import func
 import io
+import os
+

 class DestinationTableManager(object):
+    TIMESTAMP_COLUMN_NAME = "data_pipeline_timestamp"
+
     def __init__(self, target_engine, logger=None):
         self.logger = logger or logging.getLogger(__name__)
         self.target_engine = target_engine

     def create_schema(self, schema_name):
-
         self.target_engine.execute("CREATE SCHEMA IF NOT EXISTS {0}".format(schema_name))
-        #self.target_engine.execute(CreateSchema(schema_name))
+
+    def table_exists(self, schema_name, table_name):
+        return self.target_engine.dialect.has_table(self.target_engine, "{0}.{1}".format(schema_name, table_name))

     def create_table(self, schema_name, table_name, columns_configuration, drop_first):
         metadata = MetaData()

@@ -24,7 +29,7 @@ def create_table(self, schema_name, table_name, columns_configuration, drop_first):
             table.append_column(self.create_column(column_configuration['destination']))

         table.append_column(
-            Column("data_pipeline_timestamp", DateTime(timezone=True), server_default=func.now()))
+            Column(self.TIMESTAMP_COLUMN_NAME, DateTime(timezone=True), server_default=func.now()))

         if drop_first:
             self.logger.info(

@@ -72,10 +77,12 @@ def rename_table(self, schema_name, source_table_name, target_table_name):
         sql_builder.write("BEGIN TRANSACTION; ")

         # Step 3
-        sql_builder.write("ALTER TABLE {0}.{1} RENAME TO {2}; ".format(schema_name, target_table_name, old_load_table_name))
+        sql_builder.write(
+            "ALTER TABLE IF EXISTS {0}.{1} RENAME TO {2}; ".format(schema_name, target_table_name, old_load_table_name))

         # Step 4
-        sql_builder.write("ALTER TABLE {0}.{1} RENAME TO {2}; ".format(schema_name, source_table_name, target_table_name))
+        sql_builder.write(
+            "ALTER TABLE {0}.{1} RENAME TO {2}; ".format(schema_name, source_table_name, target_table_name))

         sql_builder.write("COMMIT TRANSACTION; ")
         self.logger.debug("Table Rename, executing {0}".format(sql_builder.getvalue()))

@@ -85,7 +92,34 @@ def rename_table(self, schema_name, source_table_name, target_table_name):

         sql = "DROP TABLE IF EXISTS {0}.{1} ".format(schema_name, old_load_table_name)
         self.logger.debug("Table Rename, executing {0}".format(sql))
+        self.target_engine.execute(sql)
+
+    def upsert_table(self, schema_name, source_table_name, target_table_name, columns_configuration):
+        column_array = list(map(lambda column: column['destination']['name'], columns_configuration))
+        column_list = ','.join(map(str, column_array))
+        column_list = column_list + ",{0}".format(self.TIMESTAMP_COLUMN_NAME)
+
+        primary_key_column_array = [column_configuration['destination']['name'] for column_configuration in columns_configuration if 'primary_key' in column_configuration['destination'] and column_configuration['destination']['primary_key']]
+
+        primary_key_column_list = ','.join(map(str, primary_key_column_array))
+
+        sql_builder = io.StringIO()
+        sql_builder.write("INSERT INTO {0}.{1} ({2})".format(schema_name, target_table_name, column_list))
+        sql_builder.write(os.linesep)
+        sql_builder.write(" SELECT {0} FROM {1}.{2}".format(column_list, schema_name, source_table_name))
+        sql_builder.write(os.linesep)
+        sql_builder.write(" ON CONFLICT({0}) DO UPDATE SET ".format(primary_key_column_list))
+
+        for column_configuration in columns_configuration:
+            sql_builder.write("{0} = EXCLUDED.{0},".format(column_configuration['destination']['name']))
+            sql_builder.write(os.linesep)
+
+        sql_builder.write("{0} = EXCLUDED.{0}".format(self.TIMESTAMP_COLUMN_NAME))
+
+        self.logger.debug("Upsert executing {0}".format(sql_builder.getvalue()))
+        self.target_engine.execute(sql_builder.getvalue())
+
+        sql_builder.close()

-    def upsert_data_from_stage_to_load_tables(self, source_table_configuration, target_table_configuration):
-        print('TODO - create a method to upsert the data;')
-        return;
+    def bob(self, x):
+        print(x)
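For a concrete picture of what upsert_table emits, assume a configuration with the primary-key column id and one data column StringColumn1 (both names appear in this commit's config and test CSV; the two-column list is a simplification). The generated statement would be, roughly:

INSERT INTO rdl_integration_tests.load_source_data (id,StringColumn1,data_pipeline_timestamp)
 SELECT id,StringColumn1,data_pipeline_timestamp FROM rdl_integration_tests.stage_source_data
 ON CONFLICT(id) DO UPDATE SET id = EXCLUDED.id,
StringColumn1 = EXCLUDED.StringColumn1,
data_pipeline_timestamp = EXCLUDED.data_pipeline_timestamp

PostgreSQL's ON CONFLICT requires a unique index on the conflict target, so this relies on the load table carrying a primary key on id; re-assigning id from EXCLUDED in the SET list is redundant but harmless.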

relational_data_loader_project/RelationalDataLoader.py

Lines changed: 18 additions & 3 deletions

@@ -22,7 +22,7 @@ def main(self):
         destination_engine = create_engine(args['destination-engine'])

         data_load_manager = DataLoadManager(args['configuration-folder'], data_source)
-        data_load_manager.start_imports(destination_engine, True)
+        data_load_manager.start_imports(destination_engine, args['full_refresh'])

     def configure_logging(self, log_level):
         formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

@@ -55,18 +55,33 @@ def get_arguments(self):

         parser.add_argument('source-connection-string', metavar='source-connection-string',
                             type=self.raw_connection_string_to_valid_source_connection_string,
-                            help='The source connections string. Eg: mssql+pyodbc://dwsource or csv://c://some//Path//To//Csv//Files//')
+                            help='The source connection string. Eg: mssql+pyodbc://dwsource or '
+                                 'csv://c://some//Path//To//Csv//Files//')

         parser.add_argument('destination-engine', metavar='destination-engine',
                             help='The destination engine. Eg: postgresql+psycopg2://postgres:xxxx@localhost/dest_dw')

         parser.add_argument('configuration-folder', metavar='configuration-folder',
-                            help='The configuration folder. Eg C:\\_dev\\oscars-misc\\el-pipeline-spike\\configuraton\\')
+                            help='The configuration folder. Eg C:\\_dev\\oscars-misc\\el-pipeline-spike\\configuration\\')

         parser.add_argument('--log-level',
                             default='INFO',
                             type=self.log_level_string_to_int,
                             nargs='?',
                             help='Set the logging output level. {0}'.format(_LOG_LEVEL_STRINGS))

+        parser.add_argument("--full-refresh", type=self.str2bool, nargs='?',
+                            const=True, default=False,
+                            help='If true, a full refresh of the destination will be performed. This drops/re-creates '
+                                 'the destination table(s).')
+
+
         return vars(parser.parse_args())
+
+    def str2bool(self, v):
+        if v.lower() in ('yes', 'true', 't', 'y', '1'):
+            return True
+        elif v.lower() in ('no', 'false', 'f', 'n', '0'):
+            return False
+        else:
+            raise argparse.ArgumentTypeError('Boolean value expected.')
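A quick standalone check (module-level copy of str2bool for brevity) of how the new flag parses: with nargs='?' and const=True, a bare --full-refresh enables the refresh, an explicit value is routed through str2bool, and omitting the flag leaves the default of False.

import argparse

def str2bool(v):
    # Same logic as the method above, standalone for the demo.
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')

parser = argparse.ArgumentParser()
parser.add_argument('--full-refresh', type=str2bool, nargs='?',
                    const=True, default=False)

print(parser.parse_args([]).full_refresh)                        # False
print(parser.parse_args(['--full-refresh']).full_refresh)        # True
print(parser.parse_args(['--full-refresh', 'no']).full_refresh)  # False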
Lines changed: 4 additions & 0 deletions

@@ -0,0 +1,4 @@
+id,StringColumn1,IntColumn1,DecimalColumn1,DateColumn1,DateTimeColumn1
+5,"This row WAS updated in the incremental review test",,,,
+6,"A Basic String",111,12.1212,01-Dec-1976,01-dec-1976 1:00 am
+7,"Another Basic String",111,12.1212,01-Dec-1976,01-dec-1976 1:00 am
