
Commit 1c84096

dames committed
Basic CSV loading working
1 parent 7fa066d commit 1c84096

13 files changed: +336 additions, −232 deletions

configuraton/ColumnTest.json

Lines changed: 66 additions & 0 deletions
@@ -0,0 +1,66 @@
```json
{

  "source_table": {
    "name": "ColumnTest",
    "schema": "dbo"
  },
  "target_schema": "load",
  "stage_table": "stage_source_data",
  "load_table": "load_source_data",

  "batch": {
    "size": 100000,
    "source_unique_column": "id"
  },
  "columns": [
    {
      "source_name": "id",
      "destination": {
        "name": "id",
        "type": "sqlalchemy.Integer",
        "nullable": false,
        "primary_key": true
      }
    },
    {
      "source_name": "IntColumn1",
      "destination": {
        "name": "int_column_1",
        "type": "citext.CIText",
        "nullable": true
      }
    },
    {
      "source_name": "DateColumn1",
      "destination": {
        "name": "date_column_1",
        "type": "sqlalchemy.DateTime",
        "nullable": true
      }
    },
    {
      "source_name": "DecimalColumn1",
      "destination": {
        "name": "decimal_column_1",
        "type": "sqlalchemy.Numeric",
        "nullable": true
      }
    },
    {
      "source_name": "DateTimeColumn1",
      "destination": {
        "name": "date_time_column_1",
        "type": "sqlalchemy.DateTime",
        "nullable": true
      }
    },
    {
      "source_name": "StringColumn1",
      "destination": {
        "name": "string_column_1",
        "type": "citext.CIText",
        "nullable": true
      }
    }
  ]
}
```
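
How the loader resolves the dotted `type` strings above into real column types is handled elsewhere in the repository and is not part of this diff. Purely as an illustrative sketch (`resolve_type` and `build_column` are hypothetical helpers, not code from this commit), the `columns` entries could be turned into SQLAlchemy `Column` objects like this:

```python
import importlib
import json

from sqlalchemy import Column


def resolve_type(dotted_name):
    """Resolve a dotted type string such as 'sqlalchemy.Integer' to its class."""
    module_name, _, attribute = dotted_name.rpartition('.')
    return getattr(importlib.import_module(module_name), attribute)


def build_column(column_configuration):
    """Build a destination-table Column from one entry of the 'columns' list."""
    destination = column_configuration['destination']
    return Column(destination['name'],
                  resolve_type(destination['type'])(),
                  nullable=destination.get('nullable', True),
                  primary_key=destination.get('primary_key', False))


with open('configuraton/ColumnTest.json') as configuration_file:
    pipeline_configuration = json.load(configuration_file)

# The citext.CIText entries additionally require the sqlalchemy-citext package.
columns = [build_column(entry) for entry in pipeline_configuration['columns']]
```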

configuraton/provider.json

Lines changed: 0 additions & 105 deletions
This file was deleted.

relational_data_loader_project/BatchDataLoader.py

Lines changed: 25 additions & 15 deletions
```diff
@@ -6,59 +6,69 @@


 class BatchDataLoader(object):
-    def __init__(self, data_source, source_table_configuration, target_table_configuration, columns, data_load_tracker, batch_configuration, target_engine, logger=None):
+    def __init__(self, data_source, source_table_configuration, target_schema, target_table, columns, data_load_tracker, batch_configuration, target_engine, logger=None):
         self.logger = logger or logging.getLogger(__name__)
         self.source_table_configuration = source_table_configuration
         self.columns = columns
         self.data_source = data_source
-        self.target_table_configuration = target_table_configuration
+        self.target_schema = target_schema
+        self.target_table = target_table
         self.data_load_tracker = data_load_tracker
         self.batch_configuration = batch_configuration
         self.target_engine = target_engine

     # Imports rows, returns True if >0 rows were found
-    def import_batch(self, previous_batch_key):
+    def load_batch(self, previous_batch_key):
         batch_tracker = self.data_load_tracker.start_batch()

         self.logger.debug("ImportBatch Starting from previous_batch_key: {0}".format(previous_batch_key))

         data_frame = self.data_source.get_next_data_frame(self.source_table_configuration, self.columns, self.batch_configuration, batch_tracker, previous_batch_key)

-        if len(data_frame) == 0:
+        if data_frame is None or len(data_frame) == 0:
             self.logger.debug("There are no rows to import, returning -1")
             batch_tracker.load_skipped_due_to_zero_rows()
             return -1

         data_frame = self.attach_column_transformers(data_frame)

-        self.write_data_frame_to_table(data_frame, self.target_table_configuration, self.target_engine)
+        self.write_data_frame_to_table(data_frame)
         batch_tracker.load_completed_successfully()

         last_key_returned = data_frame.iloc[-1][self.batch_configuration['source_unique_column']]

         self.logger.info("Batch key {0} Completed. {1}".format(last_key_returned, batch_tracker.get_statistics()))
         return last_key_returned

-    def write_data_frame_to_table(self, data_frame, table_configuration, target_engine):
-        destination_table = "{0}.{1}".format(table_configuration['schema'], table_configuration['name'])
-        self.logger.debug("Starting write to table {0}".format(destination_table))
+    def write_data_frame_to_table(self, data_frame):
+        qualified_target_table = "{0}.{1}".format(self.target_schema, self.target_table)
+        self.logger.debug("Starting write to table {0}".format(qualified_target_table))
         data = StringIO()
         data_frame.to_csv(data, header=False, index=False, na_rep='')
         data.seek(0)
-        raw = target_engine.raw_connection()
+        raw = self.target_engine.raw_connection()
         curs = raw.cursor()

-        #TODO: This is assuming that our destination schema column order matches the columns in the dataframe. This
-        #isn't always correct (especially in csv sources) - therefore, we should derive the column_array from the
-        #data frames' columns.
-        column_array = list(map(lambda cfg: cfg['destination']['name'], self.columns))
+        column_array = list(map(lambda source_colum_name: self.get_destination_column_name(source_colum_name), data_frame.columns))
+        column_list = ','.join(map(str, column_array))

-        curs.copy_from(data, destination_table, sep=',', columns=column_array, null='')
-        self.logger.debug("Completed write to table {0}".format(destination_table))
+        sql = "COPY {0}({1}) FROM STDIN with csv".format(qualified_target_table, column_list)
+        self.logger.debug("Writing to table using command {0}".format(sql))
+        curs.copy_expert(sql=sql, file=data)
+
+        self.logger.debug("Completed write to table {0}".format(qualified_target_table))

         curs.connection.commit()
         return

+    def get_destination_column_name(self, source_column_name):
+        for column in self.columns:
+            if column['source_name'] == source_column_name:
+                return column['destination']['name']
+
+        message = 'A source column with name {0} was not found in the column configuration'.format(source_column_name)
+        raise ValueError(message)
+
     def attach_column_transformers(self, data_frame):
         self.logger.debug("Attaching column transformers")
         for column in self.columns:
```
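
The key behavioural change in `write_data_frame_to_table` is swapping psycopg2's `copy_from` for `copy_expert` with an explicit `COPY ... FROM STDIN WITH CSV` statement, and deriving the column list from the data frame itself, which resolves the old TODO about column ordering. A minimal standalone sketch of that write path, assuming a reachable local PostgreSQL database, placeholder credentials, and an existing table `public.load_demo(id, name)` (none of which come from this commit):

```python
from io import StringIO

import pandas as pd
import psycopg2

# Two demo rows standing in for one batch produced by the data source.
data_frame = pd.DataFrame({'id': [1, 2], 'name': ['alpha', 'beta']})

# Serialise the frame to CSV in memory, as write_data_frame_to_table does.
buffer = StringIO()
data_frame.to_csv(buffer, header=False, index=False, na_rep='')
buffer.seek(0)

# Placeholder connection details; BatchDataLoader obtains this via target_engine.raw_connection().
connection = psycopg2.connect("dbname=demo user=demo password=demo host=localhost")
cursor = connection.cursor()

# CSV-mode COPY lets PostgreSQL handle quoting and embedded delimiters itself,
# which the copy_from call removed by this commit did not.
sql = "COPY {0}({1}) FROM STDIN WITH CSV".format("public.load_demo", "id,name")
cursor.copy_expert(sql=sql, file=buffer)

connection.commit()
connection.close()
```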

relational_data_loader_project/CsvDataSource.py

Lines changed: 0 additions & 21 deletions
This file was deleted.

relational_data_loader_project/DataLoadManager.py

Lines changed: 15 additions & 10 deletions
```diff
@@ -25,18 +25,22 @@ def start_single_import(self, target_engine, configuration_name, full_load):

         self.logger.debug("Execute Starting")

-        destination_table_manager = DestinationTableManager()
+        destination_table_manager = DestinationTableManager(target_engine)

-        columns = self.data_source.get_valid_columns(pipeline_configuration['source_table'], pipeline_configuration['columns'])
+        columns = self.data_source.get_valid_columns(pipeline_configuration['source_table'],
+                                                     pipeline_configuration['columns'])

+        destination_table_manager.create_schema(pipeline_configuration['target_schema'])
         if full_load:
             self.logger.info("Full-load is set. Recreating the staging table.")
-            destination_table_manager.create_table(pipeline_configuration['stage_table'],
-                                                   columns, target_engine, drop_first=True)
+            destination_table_manager.create_table(pipeline_configuration['target_schema'],
+                                                   pipeline_configuration['stage_table'],
+                                                   columns, drop_first=True)

         # Import the data.
-        batch_importer = BatchDataLoader(self.data_source,
+        batch_data_loader = BatchDataLoader(self.data_source,
                                          pipeline_configuration['source_table'],
+                                         pipeline_configuration['target_schema'],
                                          pipeline_configuration['stage_table'],
                                          columns,
                                          data_load_tracker,
@@ -45,17 +49,18 @@ def start_single_import(self, target_engine, configuration_name, full_load):

         previous_unique_column_value = 0
         while previous_unique_column_value > -1:
-            previous_unique_column_value = batch_importer.import_batch(previous_unique_column_value)
+            previous_unique_column_value = batch_data_loader.load_batch(previous_unique_column_value)

         self.logger.info("ImportBatch Completed")

-        #if full_load:
+        if full_load:
             #return
             # Rename the stage table to the load table.
-            # log.information("Full-load is set. Renaming the stage table to the load table.")
-            # rename_table(pipeline_configuration['stage_source_data'], pipeline_configuration['load_source_data'])
+            self.logger.info("Full-load is set. Renaming the stage table to the load table.")
+            destination_table_manager.rename_table(pipeline_configuration['target_schema'],
+                                                   pipeline_configuration['stage_table'],
+                                                   pipeline_configuration['load_table'])
         #else:
-            #return
             # upsert_data_from_stage_to_load_tables(pipeline_configuration['stage_source_data'], pipeline_configuration['load_source_data'])

         data_load_tracker.completed_successfully()
```
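
The `create_schema` and `rename_table` calls introduced here live in DestinationTableManager, whose implementation is not shown in this excerpt. As a rough sketch only (the connection URL, `rename_stage_to_load` name, and SQL below are assumptions for illustration, not the project's code), a PostgreSQL stage-to-load swap of the kind this commit wires up might look like:

```python
from sqlalchemy import create_engine, text

# Placeholder connection URL; in the project the engine is passed in as target_engine.
engine = create_engine("postgresql+psycopg2://demo:demo@localhost/demo")


def rename_stage_to_load(engine, schema, stage_table, load_table):
    """Swap the freshly loaded stage table into place as the load table."""
    # Names are interpolated directly, so they must come from trusted configuration.
    with engine.begin() as connection:  # one transaction: both statements apply or neither does
        connection.execute(text("DROP TABLE IF EXISTS {0}.{1}".format(schema, load_table)))
        connection.execute(text("ALTER TABLE {0}.{1} RENAME TO {2}".format(schema, stage_table, load_table)))


rename_stage_to_load(engine, "load", "stage_source_data", "load_source_data")
```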
