Skip to content
This repository was archived by the owner on Mar 13, 2020. It is now read-only.

Commit ab50132

Browse files
authored
[OSC-1277] prevent logging PI (#28)
* [OSC-1277] prevent logging PI
* add unit tests for PI leak
* update unit tests to echo properly
* ensure failure 1 works
* ensure failure 2 works
* log sensitive error data to db
1 parent 6e41346 commit ab50132

16 files changed

+103
-8
lines changed

appveyor.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ test_script:
6666

6767
- test_full_refresh_from_mssql.cmd
6868
- test_audit.cmd
69+
- test_pi_leak.cmd
6970
- test_mssql_failover_server.cmd
7071

7172
on_finish:
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
{
2+
"source_table": {
3+
"name": "CompoundPk",
4+
"schema": "dbo",
5+
"primary_keys": ["Id1"]
6+
},
7+
"target_schema": "rdl_integration_tests",
8+
"stage_table": "stage_compound_pk",
9+
"load_table": "load_compound_pk",
10+
11+
"batch": {
12+
"size": 100000
13+
},
14+
"columns": [
15+
{
16+
"source_name": "Id1",
17+
"destination": {
18+
"name": "id_1",
19+
"type": "int",
20+
"nullable": false,
21+
"primary_key": true
22+
}
23+
},
24+
{
25+
"source_name": "Id2",
26+
"destination": {
27+
"name": "id_2",
28+
"type": "int",
29+
"nullable": true
30+
}
31+
}
32+
]
33+
}

rdl/BatchDataLoader.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from io import StringIO
55
from rdl.column_transformers.StringTransformers import ToUpper
66
from rdl.shared import Constants
7+
from rdl.shared.Utils import prevent_senstive_data_logging
78

89

910
class BatchDataLoader(object):
@@ -49,6 +50,7 @@ def load_batch(self, batch_key_tracker):
4950

5051
self.logger.info(f"Batch keys '{batch_key_tracker.bookmarks}' completed. {batch_tracker.get_statistics()}")
5152

53+
@prevent_senstive_data_logging
5254
def write_data_frame_to_table(self, data_frame):
5355
qualified_target_table = f'{self.target_schema}.{self.target_table}'
5456
self.logger.debug(f"Starting write to table '{qualified_target_table}'")

rdl/DataLoadManager.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from rdl.data_load_tracking.DataLoadTracker import DataLoadTracker
1111
from rdl.BatchKeyTracker import BatchKeyTracker
1212
from rdl.shared import Constants
13+
from rdl.shared.Utils import SensitiveDataError
1314

1415

1516
class DataLoadManager(object):
@@ -137,7 +138,12 @@ def start_single_import(self, model_file, requested_full_refresh, model_number,
137138

138139
batch_key_tracker = BatchKeyTracker(model_config['source_table']['primary_keys'])
139140
while batch_key_tracker.has_more_data:
140-
batch_data_loader.load_batch(batch_key_tracker)
141+
try:
142+
batch_data_loader.load_batch(batch_key_tracker)
143+
except SensitiveDataError as e:
144+
data_load_tracker.data_load_failed(e.sensitive_error_args)
145+
self.data_load_tracker_repository.save(data_load_tracker)
146+
raise e
141147

142148
if full_refresh:
143149
# Rename the stage table to the load table.
@@ -154,7 +160,7 @@ def start_single_import(self, model_file, requested_full_refresh, model_number,
154160

155161
destination_table_manager.drop_table(model_config['target_schema'],
156162
model_config['stage_table'])
157-
data_load_tracker.completed_successfully()
163+
data_load_tracker.data_load_successful()
158164
self.data_load_tracker_repository.save(data_load_tracker)
159165
self.logger.info(f"{model_number:0{max_model_number_len}d} of {total_number_of_models}"
160166
f" COMPLETED {model_name},"

rdl/data_load_tracking/DataLoadExecution.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,3 +22,4 @@ class DataLoadExecution(Base):
2222
execution_time_ms = Column(Integer, nullable=False)
2323
rows_processed = Column(Integer, nullable=False)
2424
model_checksum = Column(String(100), nullable=False)
25+
failure_reason = Column(String(1000), nullable=True)

rdl/data_load_tracking/DataLoadTracker.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,19 +34,26 @@ def __init__(
3434
self.change_tracking_info = change_tracking_info
3535
self.correlation_id = correlation_id
3636
self.full_refresh_reason = full_refresh_reason
37+
self.failure_reason = None
3738

3839
def start_batch(self):
3940
batch = self.Batch()
4041
self.batches.append(batch)
4142
return batch
4243

43-
def completed_successfully(self):
44+
def data_load_successful(self):
45+
self.data_load_completed(Constants.ExecutionStatus.COMPLETED_SUCCESSFULLY)
46+
47+
def data_load_failed(self, failure_reason=None):
48+
self.data_load_completed(Constants.ExecutionStatus.FAILED, failure_reason)
49+
50+
def data_load_completed(self, execution_status, failure_reason=None):
4451
self.completed = datetime.now()
4552
self.total_execution_time = self.completed - self.started
46-
self.status = Constants.ExecutionStatus.COMPLETED_SUCCESSFULLY
53+
self.status = execution_status
54+
self.failure_reason = failure_reason
4755
for batch in self.batches:
4856
self.total_row_count += batch.row_count
49-
5057
self.rows_per_second = self.total_row_count / (self.total_execution_time.total_seconds() + 1e-10)
5158

5259
def get_statistics(self):

rdl/data_load_tracking/DataLoadTrackerRepository.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,8 @@ def save(self, data_load_tracker):
3636
full_refresh_reason=data_load_tracker.full_refresh_reason,
3737
execution_time_ms=int(data_load_tracker.total_execution_time.total_seconds() * 1000),
3838
rows_processed=data_load_tracker.total_row_count,
39-
model_checksum=data_load_tracker.model_checksum)
39+
model_checksum=data_load_tracker.model_checksum,
40+
failure_reason=data_load_tracker.failure_reason)
4041

4142
session = self.session_maker()
4243
session.add(data_load_execution)

rdl/data_sources/CsvDataSource.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
import logging
2-
import pandas
32
import os.path
4-
from rdl.ColumnTypeResolver import ColumnTypeResolver
53
from pathlib import Path
4+
import pandas
5+
6+
from rdl.ColumnTypeResolver import ColumnTypeResolver
67
from rdl.data_sources.ChangeTrackingInfo import ChangeTrackingInfo
8+
from rdl.shared.Utils import prevent_senstive_data_logging
79

810

911
class CsvDataSource(object):
@@ -42,6 +44,7 @@ def assert_column_exists(self, column_name, data_frame, csv_file):
4244

4345
# For now, the CSV data sources will get all rows in the CSV regardless of
4446
# batch size. - Ie, they don't currently support paging.
47+
@prevent_senstive_data_logging
4548
def get_next_data_frame(
4649
self,
4750
table_config,

rdl/data_sources/MsSqlDataSource.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from rdl.ColumnTypeResolver import ColumnTypeResolver
1414
from rdl.data_sources.ChangeTrackingInfo import ChangeTrackingInfo
1515
from rdl.shared import Constants
16+
from rdl.shared.Utils import prevent_senstive_data_logging
1617

1718

1819
class MsSqlDataSource(object):
@@ -134,6 +135,7 @@ def get_table_columns(self, table_config):
134135
autoload_with=self.database_engine)
135136
return list(map(lambda column: column.name, table.columns))
136137

138+
@prevent_senstive_data_logging
137139
def get_next_data_frame(self, table_config, columns, batch_config, batch_tracker, batch_key_tracker,
138140
full_refresh, change_tracking_info):
139141
sql = self.build_select_statement(table_config, columns, batch_config, batch_key_tracker,

rdl/shared/Constants.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ class FullRefreshReason:
1414

1515
class ExecutionStatus:
1616
NOT_STARTED = 'Not Started'
17+
FAILED = 'Failed'
1718
EXTRACT_COMPLETED_SUCCESSFULLY = 'Extract Completed Successfully'
1819
LOAD_COMPLETED_SUCCESSFULLY = 'Load Completed Successfully'
1920
SKIPPED_AS_ZERO_ROWS = 'Skipped - Zero Rows'

0 commit comments

Comments (0)