Skip to content
This repository was archived by the owner on Mar 13, 2020. It is now read-only.

Commit 4803c7a

Browse files
authored
Merge pull request #21 from PageUpPeopleOrg/bug-fix/escape-cr-lf
OSC-1241 ensure carriage returns are handled properly
2 parents 9b8b78b + f8475e2 commit 4803c7a

File tree

2 files changed

+42
-2
lines changed

2 files changed

+42
-2
lines changed

README.md

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,29 @@ Use the following vscode settings by either:
8181

8282
### Testing
8383

84+
### Postgres debugging
85+
86+
Ensure the database you are using was created with UTF8 encoding. The encoding cannot be changed once the database has been created.
87+
88+
```sql
89+
90+
CREATE DATABASE "my_database"
91+
WITH OWNER "postgres"
92+
ENCODING 'UTF8'
93+
TEMPLATE template0;
94+
95+
```
96+
97+
Also ensure that the database has the CITEXT extension installed by connecting to the database and creating it:
98+
99+
```sql
100+
101+
>>>psql my_database
102+
103+
CREATE EXTENSION CITEXT;
104+
105+
```
106+
84107
#### Integration
85108

86109
The test batch files assume there is a user by the name of `postgres` on the system.

modules/BatchDataLoader.py

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
import logging
2+
import csv
3+
24
from io import StringIO
35
from modules.column_transformers.StringTransformers import ToUpper
46
from modules.shared import Constants
@@ -51,18 +53,33 @@ def write_data_frame_to_table(self, data_frame):
5153
qualified_target_table = f'{self.target_schema}.{self.target_table}'
5254
self.logger.debug(f"Starting write to table '{qualified_target_table}'")
5355
data = StringIO()
54-
data_frame.to_csv(data, header=False, index=False, na_rep='', float_format='%.16g')
56+
# quoting: Due to \r existing in strings in MSSQL we must quote anything that's non numeric just to be safe
57+
# line_terminator: ensure \n is used even on windows machines as prod runs on *nix with \n
58+
# na_rep: Due to us quoting everything non-numeric, our nulls must be represented by something special, as the
59+
# default null representation (nothing), once quoted, is equivalent to an empty string
60+
data_frame.to_csv(data, header=False, index=False, na_rep='\\N', float_format='%.16g',
61+
quotechar='"', quoting=csv.QUOTE_NONNUMERIC, line_terminator='\n')
5562
# Float_format is used to truncate any insignificant digits. Unfortunately it gives us an artificial limitation
5663

5764
data.seek(0)
5865
raw = self.target_db.raw_connection()
5966
curs = raw.cursor()
6067

68+
# log CSV on debug
69+
if self.logger.getEffectiveLevel() == logging.DEBUG:
70+
with open(f'{qualified_target_table}.csv', 'w', encoding='utf-8') as f:
71+
f.write(data.getvalue())
72+
6173
column_array = list(
6274
map(lambda source_colum_name: self.get_destination_column_name(source_colum_name), data_frame.columns))
6375
column_list = ','.join(map(str, column_array))
6476

65-
sql = f"COPY {qualified_target_table}({column_list}) FROM STDIN with csv"
77+
# FORCE_NULL: ensure quoted fields are checked for NULLs as by default they are assumed to be non-null
78+
# specify null as \N so that psql doesn't assume empty strings are nulls
79+
sql = f"COPY {qualified_target_table}({column_list}) FROM STDIN "\
80+
f"with (format csv, "\
81+
f"null '\\N', "\
82+
f"FORCE_NULL ({column_list}))"
6683
self.logger.debug(f"Writing to table using command '{sql}'")
6784

6885
curs.copy_expert(sql=sql, file=data)

0 commit comments

Comments
 (0)