
Commit 4a3d633

[OSC-1224] Use timestamps to determine what to return (#20)
* added timestamp cli arg and return results since then
* improve output results
* ran autopep8
* updated readme.md
* fix typo in readme
* move data retrieval code to repository layer
* bug fix - remove leftover method param
* use string constants module more
* begin design of unit tests
* update unit test cmd connection string
* change unit tests to match RDL usage
* split up results based on FRT/incremental
* make an API for incremental excluding FRT
* update function name
* autopep8
* relint
* split rdl into separate commands, audit and process
* update integration tests to fit new API
* Apply suggestions from code review
  Update timezone arg to use nanoseconds as per ISO8601
  Co-Authored-By: seanbudd <seanbudd123@gmail.com>
* use a dictionary of constants for audit options
1 parent 4803c7a commit 4a3d633

14 files changed: +399 −55 lines

README.md

Lines changed: 24 additions & 5 deletions
````diff
@@ -9,11 +9,9 @@ A utility for taking data from MS-SQL and loading it into PostgreSQL
 `py rdl.py --help`
 
 ```text
-usage: rdl.py [-h] [-f [FORCE_FULL_REFRESH_MODELS]] [-l [LOG_LEVEL]]
-              source-connection-string destination-connection-string
-              configuration-folder
-
-Relational Data Loader
+usage: rdl.py process [-h] [-f [FORCE_FULL_REFRESH_MODELS]] [-l [LOG_LEVEL]]
+                      source-connection-string destination-connection-string
+                      configuration-folder
 
 positional arguments:
   source-connection-string
@@ -40,6 +38,27 @@ optional arguments:
   -l [LOG_LEVEL], --log-level [LOG_LEVEL]
                         Set the logging output level. ['CRITICAL', 'ERROR',
                         'WARNING', 'INFO', 'DEBUG']
+
+usage: rdl.py audit [-h] [-l [LOG_LEVEL]]
+                    destination-connection-string model-type timestamp
+
+positional arguments:
+  destination-connection-string
+                        The destination database connection string. Provide in
+                        PostgreSQL + Psycopg format. Eg: 'postgresql+psycopg2:
+                        //username:password@host:port/dbname'
+  model-type            Use the command FULL to return full refresh models or
+                        the command INCR to return only the incremental models
+                        since the timestamp
+  timestamp             ISO 8601 datetime with timezone (yyyy-mm-ddThh:mm:ss.nnnnnn+|-hh:mm) used to provide information on all
+                        actions since the specified date. Eg
+                        '2019-02-14T01:55:54.123456+00:00'.
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -l [LOG_LEVEL], --log-level [LOG_LEVEL]
+                        Set the logging output level. ['CRITICAL', 'ERROR',
+                        'WARNING', 'INFO', 'DEBUG']
 ```
 
 _Notes:_
````
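Read together, the two usage blocks correspond to two separate invocations. A minimal sketch of each (the connection strings and paths below are placeholders, not values from this repo):

```text
# load models, forcing a full refresh of all of them
py rdl.py process "mssql+pyodbc://dwsource" "postgresql+psycopg2://user:pass@host:5432/dw" ./models -f

# list models that had a full refresh since the given instant
py rdl.py audit "postgresql+psycopg2://user:pass@host:5432/dw" FULL "2019-02-14T01:55:54.123456+00:00"
```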

modules/DataLoadManager.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -4,6 +4,7 @@
 import hashlib
 from pathlib import Path
 from json import JSONDecodeError
+
 from modules.BatchDataLoader import BatchDataLoader
 from modules.DestinationTableManager import DestinationTableManager
 from modules.data_load_tracking.DataLoadTracker import DataLoadTracker
```

modules/RelationalDataLoader.py

Lines changed: 94 additions & 33 deletions
```diff
@@ -1,5 +1,6 @@
 import logging
 import argparse
+from datetime import datetime
 from sqlalchemy import create_engine
 from modules.DataLoadManager import DataLoadManager
 from modules.shared import Constants
@@ -8,6 +9,10 @@
 from sqlalchemy.orm import sessionmaker
 
 _LOG_LEVEL_STRINGS = ['CRITICAL', 'ERROR', 'WARNING', 'INFO', 'DEBUG']
+_AUDIT_FUNCTION_OPTIONS = {
+    'FULL': DataLoadTrackerRepository.get_full_refresh_since,
+    'INCR': DataLoadTrackerRepository.get_only_incremental_since,
+}
 
 
 class RelationalDataLoader:
@@ -16,19 +21,33 @@ def __init__(self, logger=None):
         self.data_source_factory = DataSourceFactory()
 
     def main(self):
-        args = self.get_arguments()
+        self.args = self.get_arguments()
 
-        self.configure_root_logger(args.log_level)
+        self.configure_root_logger(self.args.log_level)
 
-        source_db = self.data_source_factory.create_source(args.source_connection_string)
+        self.args.func()
 
-        destination_db = create_engine(args.destination_connection_string)
+    def execute_process_command(self):
+        source_db = self.data_source_factory.create_source(self.args.source_connection_string)
+
+        destination_db = create_engine(self.args.destination_connection_string)
         session_maker = sessionmaker(bind=destination_db)
         repository = DataLoadTrackerRepository(session_maker)
         repository.ensure_schema_exists(destination_db)
 
-        data_load_manager = DataLoadManager(args.configuration_folder, source_db, destination_db, repository)
-        data_load_manager.start_imports(args.force_full_refresh_models)
+        data_load_manager = DataLoadManager(self.args.configuration_folder, source_db, destination_db, repository)
+        data_load_manager.start_imports(self.args.force_full_refresh_models)
+
+    def execute_audit_command(self):
+        destination_db = create_engine(self.args.destination_connection_string)
+        session_maker = sessionmaker(bind=destination_db)
+        data_load_tracker_repository = DataLoadTrackerRepository(session_maker)
+
+        last_successful_timestamp = datetime.fromisoformat(self.args.timestamp)
+
+        results = _AUDIT_FUNCTION_OPTIONS[self.args.model_type](data_load_tracker_repository, last_successful_timestamp)
+
+        print(" ".join(results))
 
     def configure_root_logger(self, log_level):
         # get the root logger
@@ -67,39 +86,81 @@ def raw_connection_string_to_valid_source_connection_string(self, connection_str
     def get_arguments(self):
         parser = argparse.ArgumentParser(description=Constants.APP_NAME)
 
-        parser.add_argument(
+        subparsers = parser.add_subparsers(title='commands', metavar='', dest='command')
+
+        process_command_parser = subparsers.add_parser('process', help='processes load models')
+        process_command_parser.set_defaults(func=self.execute_process_command)
+
+        process_command_parser.add_argument(
             'source_connection_string',
             metavar='source-connection-string',
             type=self.raw_connection_string_to_valid_source_connection_string,
             help='The source connection string as a 64bit ODBC system dsn. Eg: mssql+pyodbc://dwsource or '
                  'csv://c://some//Path//To//Csv//Files//')
 
-        parser.add_argument('destination_connection_string',
-                            metavar='destination-connection-string',
-                            help='The destination database connection string. Provide in PostgreSQL + Psycopg format. '
-                                 'Eg: \'postgresql+psycopg2://username:password@host:port/dbname\'')
-
-        parser.add_argument('configuration_folder',
-                            metavar='configuration-folder',
-                            help='Absolute or relative path to the models. '
-                                 'Eg \'./models\', \'C:/path/to/models\'')
-
-        parser.add_argument('-f',
-                            '--force-full-refresh-models',
-                            nargs='?',
-                            const='*',
-                            help='Comma separated model names in the configuration folder. These models would be '
-                                 'forcefully refreshed dropping and recreating the destination tables. All others '
-                                 'models would only be refreshed if required as per the state of the source and '
-                                 'destination tables. '
-                                 'Eg \'CompoundPkTest,LargeTableTest\'. '
-                                 'Leave blank or use glob (*) to force full refresh of all models.')
-
-        parser.add_argument('-l', '--log-level',
-                            default='INFO',
-                            type=self.log_level_string_to_int,
-                            nargs='?',
-                            help=f'Set the logging output level. {_LOG_LEVEL_STRINGS}')
+        process_command_parser.add_argument(
+            'destination_connection_string',
+            metavar='destination-connection-string',
+            help='The destination database connection string. Provide in PostgreSQL'
+                 ' + Psycopg format. '
+                 'Eg: \'postgresql+psycopg2://username:password@host:port/dbname\'')
+
+        process_command_parser.add_argument(
+            'configuration_folder',
+            metavar='configuration-folder',
+            help='Absolute or relative path to the models. '
+                 'Eg \'./models\', \'C:/path/to/models\'')
+
+        process_command_parser.add_argument(
+            '-f',
+            '--force-full-refresh-models',
+            nargs='?',
+            const='*',
+            help='Comma separated model names in the configuration folder. '
+                 'These models would be forcefully refreshed dropping and recreating the '
+                 'destination tables. All other models would only be refreshed if required '
+                 'as per the state of the source and destination tables. '
+                 'Eg \'CompoundPkTest,LargeTableTest\'. '
+                 'Leave blank or use glob (*) to force full refresh of all models.')
+
+        process_command_parser.add_argument(
+            '-l', '--log-level',
+            default='INFO',
+            type=self.log_level_string_to_int,
+            nargs='?',
+            help=f'Set the logging output level. {_LOG_LEVEL_STRINGS}')
+
+        audit_command_parser = subparsers.add_parser('audit',
+                                                     help='provides list of processed models since a given timestamp')
+        audit_command_parser.set_defaults(func=self.execute_audit_command)
+
+        audit_command_parser.add_argument(
+            'destination_connection_string',
+            metavar='destination-connection-string',
+            help='The destination database connection string. Provide in PostgreSQL'
+                 ' + Psycopg format. '
+                 'Eg: \'postgresql+psycopg2://username:password@host:port/dbname\'')
+
+        audit_command_parser.add_argument(
+            'model_type',
+            metavar='model-type',
+            choices=_AUDIT_FUNCTION_OPTIONS.keys(),
+            help='Use the command FULL to return full refresh models or the '
+                 'command INCR to return only the incremental models since the timestamp')
+
+        audit_command_parser.add_argument(
+            'timestamp',
+            metavar='timestamp',
+            help='ISO 8601 datetime with timezone (yyyy-mm-ddThh:mm:ss.nnnnnn+|-hh:mm) used to provide information '
+                 'on all actions since the specified date. '
+                 'Eg \'2019-02-14T01:55:54.123456+00:00\'. ')
+
+        audit_command_parser.add_argument(
+            '-l', '--log-level',
+            default='INFO',
+            type=self.log_level_string_to_int,
+            nargs='?',
+            help=f'Set the logging output level. {_LOG_LEVEL_STRINGS}')
 
         return parser.parse_args()
 
```
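The dispatch relies on a standard argparse idiom: each subparser registers its handler with `set_defaults(func=...)`, and `main()` then simply calls `self.args.func()`. A self-contained sketch of the same pattern (the names here are illustrative, not taken from the repo):

```python
import argparse


def process(args):
    # handler registered on the 'process' subparser
    print(f"processing models from {args.configuration_folder}")


def audit(args):
    # handler registered on the 'audit' subparser
    print(f"auditing changes since {args.timestamp}")


parser = argparse.ArgumentParser(description='subcommand dispatch demo')
subparsers = parser.add_subparsers(title='commands', dest='command')

process_parser = subparsers.add_parser('process')
process_parser.add_argument('configuration_folder')
process_parser.set_defaults(func=process)

audit_parser = subparsers.add_parser('audit')
audit_parser.add_argument('timestamp')
audit_parser.set_defaults(func=audit)

args = parser.parse_args()
# note: if no subcommand is given, 'func' is never set and this raises
# AttributeError; the commit's code has the same property
args.func(args)
```

Worth noting for `execute_audit_command`: `datetime.fromisoformat` (Python 3.7+) accepts exactly the `yyyy-mm-ddThh:mm:ss.nnnnnn+hh:mm` shape the help text documents, e.g. `datetime.fromisoformat('2019-02-14T01:55:54.123456+00:00')`.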

modules/data_load_tracking/DataLoadTrackerRepository.py

Lines changed: 43 additions & 2 deletions
```diff
@@ -1,8 +1,11 @@
 import logging
-from sqlalchemy import desc
+
 from modules.data_load_tracking.DataLoadExecution import DataLoadExecution, Base
 from modules.shared import Constants
 
+from sqlalchemy import desc
+from sqlalchemy import func
+
 
 class DataLoadTrackerRepository(object):
     def __init__(self, session_maker, logger=None):
@@ -15,10 +18,12 @@ def ensure_schema_exists(self, engine):
 
     def get_last_successful_data_load_execution(self, model_name):
         session = self.session_maker()
-        return session.query(DataLoadExecution)\
+        result = session.query(DataLoadExecution)\
             .filter_by(model_name=model_name, status=Constants.ExecutionStatus.COMPLETED_SUCCESSFULLY)\
             .order_by(desc(DataLoadExecution.completed_on))\
             .first()
+        session.close()
+        return result
 
     def save(self, data_load_tracker):
         data_load_execution = DataLoadExecution(
@@ -36,3 +41,39 @@ def save(self, data_load_tracker):
         session = self.session_maker()
         session.add(data_load_execution)
         session.commit()
+        session.close()
+
+    def get_full_refresh_since(self, timestamp):
+        session = self.session_maker()
+        results = session.query(DataLoadExecution.model_name)\
+            .filter(DataLoadExecution.completed_on > timestamp,
+                    DataLoadExecution.is_full_refresh)\
+            .distinct(DataLoadExecution.model_name)\
+            .group_by(DataLoadExecution.model_name)\
+            .all()
+        session.close()
+        return [r for (r, ) in results]
+
+    def get_incremental_since(self, timestamp):
+        session = self.session_maker()
+        results = session.query(DataLoadExecution.model_name)\
+            .filter(DataLoadExecution.completed_on > timestamp,
+                    DataLoadExecution.is_full_refresh == False,
+                    DataLoadExecution.rows_processed > 0)\
+            .distinct(DataLoadExecution.model_name)\
+            .group_by(DataLoadExecution.model_name)\
+            .all()
+        session.close()
+        return [r for (r, ) in results]
+
+    def get_only_incremental_since(self, timestamp):
+        session = self.session_maker()
+        results = session.query(DataLoadExecution.model_name)\
+            .filter(DataLoadExecution.completed_on > timestamp,
+                    DataLoadExecution.rows_processed > 0)\
+            .distinct(DataLoadExecution.model_name)\
+            .group_by(DataLoadExecution.model_name)\
+            .having(func.bool_and(DataLoadExecution.is_full_refresh == False))\
+            .all()
+        session.close()
+        return [r for (r, ) in results]
```
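`get_only_incremental_since` leans on PostgreSQL's `bool_and` aggregate: rows are grouped per `model_name`, and the `HAVING` clause keeps only groups in which every execution since the timestamp was incremental, so a model that also had a full refresh in that window drops out. Assuming the `DataLoadExecution` model maps to a table and columns named like its attributes (an assumption; the mapping is defined elsewhere), the emitted SQL is roughly the following (the query's `.distinct()` is redundant alongside the `GROUP BY` and is omitted here):

```sql
SELECT model_name
FROM data_load_execution
WHERE completed_on > :timestamp
  AND rows_processed > 0
GROUP BY model_name
HAVING bool_and(is_full_refresh = false);
```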

modules/data_sources/MsSqlDataSource.py

Lines changed: 2 additions & 1 deletion
```diff
@@ -51,7 +51,8 @@ def build_select_statement(self, table_config, columns, batch_config, batch_key_
             order_by_sql = "ORDER BY " + f", {MsSqlDataSource.SOURCE_TABLE_ALIAS}.".join(table_config['primary_keys'])
         else:
             select_sql = f"SELECT TOP ({batch_config['size']}) {column_names}, " \
-                         f"{MsSqlDataSource.CHANGE_TABLE_ALIAS}.SYS_CHANGE_VERSION AS {Constants.AuditColumnNames.CHANGE_VERSION}, " \
+                         f"{MsSqlDataSource.CHANGE_TABLE_ALIAS}.SYS_CHANGE_VERSION" \
+                         f" AS {Constants.AuditColumnNames.CHANGE_VERSION}, " \
                          f"CASE {MsSqlDataSource.CHANGE_TABLE_ALIAS}.SYS_CHANGE_OPERATION WHEN 'D' THEN 1 ELSE 0 " \
                          f"END AS {Constants.AuditColumnNames.IS_DELETED}"
             from_sql = f"FROM CHANGETABLE(CHANGES" \
```
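For context, the f-string being re-wrapped builds an MS-SQL change-tracking select. With placeholder aliases, table, and audit column names (the real values come from `MsSqlDataSource` and `Constants`, which this diff does not show), the rendered statement looks roughly like:

```sql
SELECT TOP (1000) t.col_a, t.col_b,
       chg.SYS_CHANGE_VERSION AS change_version,
       CASE chg.SYS_CHANGE_OPERATION WHEN 'D' THEN 1 ELSE 0 END AS is_deleted
FROM CHANGETABLE(CHANGES dbo.source_table, @last_sync_version) AS chg
-- the join back to the source table and the ORDER BY are assembled further down in the method
```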
Lines changed: 10 additions & 3 deletions
```diff
@@ -1,5 +1,12 @@
 {
-    "username": "rdl_test_user",
-    "password": "hunter2",
-    "server_string": "(local)\\SQLEXPRESS"
+    "mssql": {
+        "username": "rdl_test_user",
+        "password": "hunter2",
+        "server_string": "(local)\\SQLEXPRESS"
+    },
+    "psql": {
+        "username": "postgres",
+        "password": "postgres",
+        "server_string": "postgres"
+    }
 }
```
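The test credentials are now namespaced per engine rather than MS-SQL-only. A sketch of how a test helper might turn this file into the two connection strings the CLI expects (the file name, helper, and URL shapes are assumptions, not part of this commit):

```python
import json


def load_test_connection_strings(path='test_config.json'):
    """Build source and destination connection strings from the nested test config."""
    with open(path) as f:
        config = json.load(f)

    mssql, psql = config['mssql'], config['psql']
    return {
        'source': f"mssql+pyodbc://{mssql['username']}:{mssql['password']}@{mssql['server_string']}",
        # the 'rdl_test' database name is illustrative
        'destination': f"postgresql+psycopg2://{psql['username']}:{psql['password']}@{psql['server_string']}/rdl_test",
    }
```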
