Skip to content
This repository was archived by the owner on Mar 13, 2020. It is now read-only.

Commit cc88ac6

Browse files
authored
Merge pull request #2 from PageUpPeopleOrg/OSC-922-FixNullInts
Osc 922 fix null ints
2 parents cd2046a + 2a8b686 commit cc88ac6

File tree

15 files changed

+198
-91
lines changed

15 files changed

+198
-91
lines changed

README.md

Lines changed: 29 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,34 @@ In the above example, dwsource is a 64bit ODBC system dsn
3232
Run with `--log-level DEBUG` on the command line.
3333

3434

35-
##Other Notes
36-
###Testing
35+
## Other Notes
36+
### Testing
3737
The test batch files assume there is a user by the name of `postgres` on the system.
3838
It also sends through a nonsense password - it is assumed that the target system is running in 'trust' mode.
39-
See https://www.postgresql.org/docs/9.1/static/auth-pg-hba-conf.html for details on trust mode
39+
See https://www.postgresql.org/docs/9.1/static/auth-pg-hba-conf.html for details on trust mode
40+
41+
42+
43+
### Destination.Type Values
44+
The destination.type value controls both the data reader type and the destination column type. They are mapped as follows
45+
46+
| destination.type | pandas type | sqlalchemy type | dw column type | notes |
47+
|-----------------------------|-------------|---------------------------------------|----------------|--------------------------------------------------|
48+
| string | str | citext.CIText | citext | A case-insensitive string that supports unicode |
49+
| int (when nullable = false) | int | sqlalchemy.Integer | int | An (optionally) signed INT value |
50+
| int (when nullable = true) | object | sqlalchemy.Integer | int | An (optionally) signed INT value |
51+
| datetime | str | sqlalchemy.DateTime | datetime (tz?) | |
52+
| json | str | sqlalchemy.dialects.postgresql.JSONB | jsonb | Stored as binary-encoded json on the database |
53+
| numeric | float | sqlalchemy.Numeric | numeric | Stores whole and decimal numbers |
54+
| guid | str | sqlalchemy.dialects.postgresql.UUID | uuid | |
55+
| bigint                      | int         | sqlalchemy.BigInteger                 | BigInt         | Relies on 64-bit Python. Limited to a largest value of ~9223372036854775807 |
56+
57+
58+
These are implemented in ColumnTypeResolver.py
59+
60+
61+
62+
63+
64+
65+

integration_tests/csv_source/assertions/column_test_full_refresh_assertions.sql

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,20 +3,20 @@ SET client_encoding TO 'UTF8';
33
DROP TABLE IF EXISTS results;
44

55
CREATE TEMPORARY TABLE results AS
6-
WITH expected(id, int_column_1, date_column_1, decimal_column_1, date_time_column_1, string_column_1) AS (
7-
SELECT 1, 111.0, '1976-12-01'::DATE, 12.1212, '1976-12-01 01:00:00.000000'::TIMESTAMP, 'A Basic String'
6+
WITH expected(id, int_column_1, date_column_1, decimal_column_1, date_time_column_1, string_column_1, guid_column_1,big_int_column_1) AS (
7+
SELECT 1, 111.0, '1976-12-01'::DATE, 12.1212, '1976-12-01 01:00:00.000000'::TIMESTAMP, 'A Basic String', '57bc8093-fe4c-477a-bbd7-fb5c02055a7e'::UUID,2147483647121212
88
UNION ALL
9-
SELECT 2, NULL, NULL, NULL, NULL, NULL
9+
SELECT 2, NULL, NULL, NULL, NULL, NULL, NULL, NULL
1010
UNION ALL
11-
SELECT 3, 333.0, '2001-01-01', 33.333, NULL, 'This Text Has a Quote Before "Dave'
11+
SELECT 3, 333.0, '2001-01-01', 33.333, NULL, 'This Text Has a Quote Before "Dave', NULL, NULL
1212
UNION ALL
13-
SELECT 4, NULL, NULL, NULL, NULL, 'ം ഃ അ ആ ഇ ഈ ഉ ഊ ഋ ഌ എ ഏ'
13+
SELECT 4, NULL, NULL, NULL, NULL, 'ം ഃ അ ആ ഇ ഈ ഉ ഊ ഋ ഌ എ ഏ','aabc8093-fe4c-477a-bbd7-fb5c02055a7e', NULL
1414
UNION ALL
15-
SELECT 5, NULL, NULL, NULL, NULL, 'This row will be updated in the incremental review test'
15+
SELECT 5, NULL, NULL, NULL, NULL, 'This row will be updated in the incremental review test', NULL, NULL
1616
),
1717

1818
actual AS (
19-
SELECT id, int_column_1, date_column_1, decimal_column_1, date_time_column_1, string_column_1
19+
SELECT id, int_column_1, date_column_1, decimal_column_1, date_time_column_1, string_column_1,guid_column_1,big_int_column_1
2020
FROM rdl_integration_tests.load_source_data
2121
)
2222

integration_tests/csv_source/assertions/column_test_incremental_refresh_assertions.sql

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,24 +2,24 @@ SET client_encoding TO 'UTF8';
22
DROP TABLE IF EXISTS results;
33

44
CREATE TEMPORARY TABLE results AS
5-
WITH expected(id, int_column_1, date_column_1, decimal_column_1, date_time_column_1, string_column_1) AS (
6-
SELECT 1, 111.0, '1976-12-01'::DATE, 12.1212, '1976-12-01 01:00:00.000000'::TIMESTAMP, 'A Basic String'
5+
WITH expected(id, int_column_1, date_column_1, decimal_column_1, date_time_column_1, string_column_1, guid_column_1,big_int_column_1) AS (
6+
SELECT 1, 111.0, '1976-12-01'::DATE, 12.1212, '1976-12-01 01:00:00.000000'::TIMESTAMP, 'A Basic String', '57bc8093-fe4c-477a-bbd7-fb5c02055a7e'::UUID,2147483647121212
77
UNION ALL
8-
SELECT 2, NULL, NULL, NULL, NULL, NULL
8+
SELECT 2, NULL, NULL, NULL, NULL, NULL, NULL, NULL
99
UNION ALL
10-
SELECT 3, 333.0, '2001-01-01', 33.333, NULL, 'This Text Has a Quote Before "Dave'
10+
SELECT 3, 333.0, '2001-01-01', 33.333, NULL, 'This Text Has a Quote Before "Dave', NULL, NULL
1111
UNION ALL
12-
SELECT 4, NULL, NULL, NULL, NULL, 'ം ഃ അ ആ ഇ ഈ ഉ ഊ ഋ ഌ എ ഏ'
12+
SELECT 4, NULL, NULL, NULL, NULL, 'ം ഃ അ ആ ഇ ഈ ഉ ഊ ഋ ഌ എ ഏ', 'aabc8093-fe4c-477a-bbd7-fb5c02055a7e', NULL
1313
UNION ALL
14-
SELECT 5, NULL, NULL, NULL, NULL, 'This row WAS updated in the incremental review test'
14+
SELECT 5, NULL, NULL, NULL, NULL, 'This row WAS updated in the incremental review test', NULL, NULL
1515
UNION ALL
16-
SELECT 6, 111.0, '1976-12-01'::DATE, 12.1212, '1976-12-01 01:00:00.000000'::TIMESTAMP, 'A Basic String'
16+
SELECT 6, 111.0, '1976-12-01'::DATE, 12.1212, '1976-12-01 01:00:00.000000'::TIMESTAMP, 'A Basic String', '57bc8093-fe4c-477a-bbd7-fb5c02055a7e', NULL
1717
UNION ALL
18-
SELECT 7, 111.0, '1976-12-01'::DATE, 12.1212, '1976-12-01 01:00:00.000000'::TIMESTAMP, 'Another Basic String'
18+
SELECT 7, 111.0, '1976-12-01'::DATE, 12.1212, '1976-12-01 01:00:00.000000'::TIMESTAMP, 'Another Basic String', NULL, NULL
1919
),
2020

2121
actual AS (
22-
SELECT id, int_column_1, date_column_1, decimal_column_1, date_time_column_1, string_column_1
22+
SELECT id, int_column_1, date_column_1, decimal_column_1, date_time_column_1, string_column_1, guid_column_1,big_int_column_1
2323
FROM rdl_integration_tests.load_source_data
2424
)
2525

integration_tests/csv_source/config/ColumnTest.json

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
"source_name": "id",
1818
"destination": {
1919
"name": "id",
20-
"type": "sqlalchemy.Integer",
20+
"type": "int",
2121
"nullable": false,
2222
"primary_key": true
2323
}
@@ -26,41 +26,61 @@
2626
"source_name": "IntColumn1",
2727
"destination": {
2828
"name": "int_column_1",
29-
"type": "sqlalchemy.Numeric",
29+
"type": "int",
3030
"nullable": true
3131
}
3232
},
3333
{
3434
"source_name": "DateColumn1",
3535
"destination": {
3636
"name": "date_column_1",
37-
"type": "sqlalchemy.DateTime",
37+
"type": "datetime",
3838
"nullable": true
3939
}
4040
},
4141
{
4242
"source_name": "DecimalColumn1",
4343
"destination": {
4444
"name": "decimal_column_1",
45-
"type": "sqlalchemy.Numeric",
45+
"type": "numeric",
4646
"nullable": true
4747
}
4848
},
4949
{
5050
"source_name": "DateTimeColumn1",
5151
"destination": {
5252
"name": "date_time_column_1",
53-
"type": "sqlalchemy.DateTime",
53+
"type": "datetime",
5454
"nullable": true
5555
}
5656
},
5757
{
5858
"source_name": "StringColumn1",
5959
"destination": {
6060
"name": "string_column_1",
61-
"type": "citext.CIText",
61+
"type": "string",
62+
"nullable": true
63+
}
64+
},
65+
{
66+
"source_name": "GuidColumn1",
67+
"destination": {
68+
"name": "guid_column_1",
69+
"type": "guid",
70+
"nullable": true
71+
}
72+
},
73+
{
74+
"source_name": "BigIntColumn1",
75+
"destination": {
76+
"name": "big_int_column_1",
77+
"type": "bigint",
6278
"nullable": true
6379
}
6480
}
81+
82+
83+
84+
6585
]
6686
}
Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
1-
id,StringColumn1,IntColumn1,DecimalColumn1,DateColumn1,DateTimeColumn1
2-
1,"A Basic String",111,12.1212,01-Dec-1976,01-dec-1976 1:00 am
3-
2,,,,,
4-
3,"This Text Has a Quote Before ""Dave", 333,33.333, 01-01-01,
5-
4,"ം ഃ അ ആ ഇ ഈ ഉ ഊ ഋ ഌ എ ഏ",,,,
6-
5,"This row will be updated in the incremental review test"
1+
id,StringColumn1,IntColumn1,DecimalColumn1,DateColumn1,DateTimeColumn1,GuidColumn1,BigIntColumn1
2+
1,"A Basic String",111,12.1212,01-Dec-1976,01-dec-1976 1:00 am,57BC8093-FE4C-477A-BBD7-FB5C02055A7E,2147483647121212
3+
2,,,,,,,
4+
3,"This Text Has a Quote Before ""Dave", 333,33.333, 01-01-01,,,
5+
4,"ം ഃ അ ആ ഇ ഈ ഉ ഊ ഋ ഌ എ ഏ",,,,,AABC8093-FE4C-477A-BBD7-FB5C02055A7E,
6+
5,"This row will be updated in the incremental review test",,
Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
id,StringColumn1,IntColumn1,DecimalColumn1,DateColumn1,DateTimeColumn1
2-
5,"This row WAS updated in the incremental review test",,,,
3-
6,"A Basic String",111,12.1212,01-Dec-1976,01-dec-1976 1:00 am
4-
7,"Another Basic String",111,12.1212,01-Dec-1976,01-dec-1976 1:00 am
1+
id,StringColumn1,IntColumn1,DecimalColumn1,DateColumn1,DateTimeColumn1,GuidColumn1,BigIntColumn1
2+
5,"This row WAS updated in the incremental review test",,,,,,
3+
6,"A Basic String",111,12.1212,01-Dec-1976,01-dec-1976 1:00 am,57BC8093-FE4C-477A-BBD7-FB5C02055A7E,
4+
7,"Another Basic String",111,12.1212,01-Dec-1976,01-dec-1976 1:00 am,,

integration_tests/mssql_source/config/LargeTableTest.json

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
"source_name": "Id",
1818
"destination": {
1919
"name": "id",
20-
"type": "sqlalchemy.Integer",
20+
"type": "int",
2121
"nullable": false,
2222
"primary_key": true
2323
}
@@ -26,47 +26,47 @@
2626
"source_name": "DateColumn1",
2727
"destination": {
2828
"name": "date_column_1",
29-
"type": "sqlalchemy.DateTime",
29+
"type": "datetime",
3030
"nullable": true
3131
}
3232
},
3333
{
3434
"source_name": "IntColumn1",
3535
"destination": {
3636
"name": "int_column_1",
37-
"type": "sqlalchemy.Numeric",
37+
"type": "int",
3838
"nullable": true
3939
}
4040
},
4141
{
4242
"source_name": "DateColumn2",
4343
"destination": {
4444
"name": "date_column_2",
45-
"type": "sqlalchemy.DateTime",
45+
"type": "datetime",
4646
"nullable": true
4747
}
4848
},
4949
{
5050
"source_name": "StringColumn1",
5151
"destination": {
5252
"name": "string_column_1",
53-
"type": "citext.CIText",
53+
"type": "string",
5454
"nullable": true
5555
}
5656
},
5757
{
5858
"source_name": "StringColumn2",
5959
"destination": {
6060
"name": "string_column_2",
61-
"type": "citext.CIText",
61+
"type": "string",
6262
"nullable": true
6363
}
6464
},
6565
{
6666
"source_name": "GuidColumn",
6767
"destination": {
6868
"name": "guid_column",
69-
"type": "citext.CIText",
69+
"type": "guid",
7070
"nullable": true
7171
}
7272
}

modules/BatchDataLoader.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,8 @@
55

66

77
class BatchDataLoader(object):
8-
def __init__(self, data_source, source_table_configuration, target_schema, target_table, columns, data_load_tracker, batch_configuration, target_engine, logger=None):
8+
def __init__(self, data_source, source_table_configuration, target_schema, target_table, columns, data_load_tracker,
9+
batch_configuration, target_engine, logger=None):
910
self.logger = logger or logging.getLogger(__name__)
1011
self.source_table_configuration = source_table_configuration
1112
self.columns = columns
@@ -22,7 +23,8 @@ def load_batch(self, previous_batch_key):
2223

2324
self.logger.debug("ImportBatch Starting from previous_batch_key: {0}".format(previous_batch_key))
2425

25-
data_frame = self.data_source.get_next_data_frame(self.source_table_configuration, self.columns, self.batch_configuration, batch_tracker, previous_batch_key)
26+
data_frame = self.data_source.get_next_data_frame(self.source_table_configuration, self.columns,
27+
self.batch_configuration, batch_tracker, previous_batch_key)
2628

2729
if data_frame is None or len(data_frame) == 0:
2830
self.logger.debug("There are no rows to import, returning -1")
@@ -42,12 +44,16 @@ def write_data_frame_to_table(self, data_frame):
4244
qualified_target_table = "{0}.{1}".format(self.target_schema, self.target_table)
4345
self.logger.debug("Starting write to table {0}".format(qualified_target_table))
4446
data = StringIO()
45-
data_frame.to_csv(data, header=False, index=False, na_rep='')
47+
48+
data_frame.to_csv(data, header=False, index=False, na_rep='', float_format='%.16g')
49+
# Float_format is used to truncate any insignificant digits. Unfortunately it gives us an artificial limitation
50+
4651
data.seek(0)
4752
raw = self.target_engine.raw_connection()
4853
curs = raw.cursor()
4954

50-
column_array = list(map(lambda source_colum_name: self.get_destination_column_name(source_colum_name), data_frame.columns))
55+
column_array = list(
56+
map(lambda source_colum_name: self.get_destination_column_name(source_colum_name), data_frame.columns))
5157
column_list = ','.join(map(str, column_array))
5258

5359
sql = "COPY {0}({1}) FROM STDIN with csv".format(qualified_target_table, column_list)

modules/ColumnTypeResolver.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
import citext
2+
from sqlalchemy import DateTime, Numeric, Integer, BigInteger
3+
from sqlalchemy.dialects.postgresql import JSONB
4+
from sqlalchemy.dialects.postgresql import UUID
5+
6+
7+
class ColumnTypeResolver(object):
8+
PANDAS_TYPE_MAP = {'string': str,
9+
'datetime': str,
10+
'json': str,
11+
'numeric': float,
12+
'guid': str,
13+
'bigint': int}
14+
15+
POSTGRES_TYPE_MAP = {'string': citext.CIText,
16+
'datetime': DateTime,
17+
'json': JSONB,
18+
'numeric': Numeric,
19+
'guid': UUID,
20+
'int': Integer,
21+
'bigint': BigInteger}
22+
23+
def resolve_postgres_type(self, column):
24+
return self.POSTGRES_TYPE_MAP[column['type']]
25+
26+
def resolve_pandas_type(self, column):
27+
if column['type'] == 'int':
28+
if column['nullable']:
29+
return object
30+
else:
31+
return int
32+
else:
33+
return self.PANDAS_TYPE_MAP[column['type']]
34+
35+
def create_column_type_dictionary(self, columns):
36+
types = {}
37+
for column in columns:
38+
types[column['source_name']] = self.resolve_pandas_type(column['destination'])
39+
return types

modules/DataLoadManager.py

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -40,13 +40,10 @@ def start_single_import(self, target_engine, configuration_name, requested_full_
4040

4141
data_load_tracker = DataLoadTracker(configuration_name, json_data, full_refresh)
4242

43-
columns = self.data_source.get_valid_columns(pipeline_configuration['source_table'],
43+
self.data_source.assert_data_source_is_valid(pipeline_configuration['source_table'],
4444
pipeline_configuration['columns'])
4545

46-
if columns is None:
47-
self.logger.debug("There are no columns, returning.")
48-
return
49-
46+
columns = pipeline_configuration['columns']
5047
destination_table_manager.create_schema(pipeline_configuration['target_schema'])
5148

5249
self.logger.debug("Recreating the staging table {0}.{1}".format(pipeline_configuration['target_schema'], pipeline_configuration['stage_table']))

0 commit comments

Comments
 (0)