From 055a243e952200ef5f249a59c81177393f5d6b79 Mon Sep 17 00:00:00 2001 From: Sean S Date: Fri, 29 Jan 2021 21:08:44 +0000 Subject: [PATCH 1/3] feat: Add schema selection and print 1st difference --- build.sh | 4 ++++ pgdatadiff/main.py | 11 ++++++++-- pgdatadiff/pgdatadiff.py | 43 +++++++++++++++++++++++++++++----------- setup.py | 2 +- 4 files changed, 45 insertions(+), 15 deletions(-) create mode 100755 build.sh diff --git a/build.sh b/build.sh new file mode 100755 index 0000000..e4e0df3 --- /dev/null +++ b/build.sh @@ -0,0 +1,4 @@ +#!/bin/bash +rm -rf dist +rm -rf build +python setup.py build && python setup.py install diff --git a/pgdatadiff/main.py b/pgdatadiff/main.py index db41464..06dfea1 100644 --- a/pgdatadiff/main.py +++ b/pgdatadiff/main.py @@ -1,6 +1,6 @@ """ Usage: - pgdatadiff --firstdb= --seconddb= [--only-data|--only-sequences] [--count-only] [--chunk-size=] + pgdatadiff --firstdb= --seconddb= [--schema=] [--only-data|--only-sequences] [--count-only] [--chunk-size=] pgdatadiff --version Options: @@ -8,6 +8,7 @@ --version Show version. --firstdb=postgres://postgres:password@localhost/firstdb The connection string of the first DB --seconddb=postgres://postgres:password@localhost/seconddb The connection string of the second DB + --schema= Set the schema (by default, public) --only-data Only compare data, exclude sequences --only-sequences Only compare seqences, exclude data --count-only Do a quick test based on counts alone @@ -26,12 +27,18 @@ def main(): __doc__, version=pkg_resources.require("pgdatadiff")[0].version) first_db_connection_string=arguments['--firstdb'] second_db_connection_string=arguments['--seconddb'] + if not first_db_connection_string.startswith("postgres://") or \ not second_db_connection_string.startswith("postgres://"): print(red("Only Postgres DBs are supported")) return 1 - differ = DBDiff(first_db_connection_string, second_db_connection_string, + schema_name='public' + + if '--schema' in arguments: + schema_name=arguments['--schema'] + + differ = DBDiff(first_db_connection_string, second_db_connection_string, schema_name, chunk_size=arguments['--chunk-size'], count_only=arguments['--count-only']) diff --git a/pgdatadiff/pgdatadiff.py b/pgdatadiff/pgdatadiff.py index 1bb9be1..0866d80 100644 --- a/pgdatadiff/pgdatadiff.py +++ b/pgdatadiff/pgdatadiff.py @@ -19,7 +19,7 @@ def make_session(connection_string): class DBDiff(object): - def __init__(self, firstdb, seconddb, chunk_size=10000, count_only=False): + def __init__(self, firstdb, seconddb, schema_name, chunk_size=10000, count_only=False): firstsession, firstengine = make_session(firstdb) secondsession, secondengine = make_session(seconddb) self.firstsession = firstsession @@ -32,13 +32,14 @@ def __init__(self, firstdb, seconddb, chunk_size=10000, count_only=False): self.secondinspector = inspect(secondengine) self.chunk_size = int(chunk_size) self.count_only = count_only + self.schema_name = schema_name def diff_table_data(self, tablename): try: - firsttable = Table(tablename, self.firstmeta, autoload=True) + firsttable = Table(tablename, self.firstmeta, autoload=True, schema=self.schema_name) firstquery = self.firstsession.query( firsttable) - secondtable = Table(tablename, self.secondmeta, autoload=True) + secondtable = Table(tablename, self.secondmeta, autoload=True, schema=self.schema_name) secondquery = self.secondsession.query( secondtable) if firstquery.count() != secondquery.count(): @@ -48,8 +49,7 @@ def diff_table_data(self, tablename): return None, "tables are empty" if self.count_only is True: return True, "Counts are the same" - pk = ",".join(self.firstinspector.get_pk_constraint(tablename)[ - 'constrained_columns']) + pk = ",".join(['"' + x + '"' for x in self.firstinspector.get_pk_constraint(tablename)['constrained_columns']]) if not pk: return None, "no primary key(s) on this table." \ " Comparision is not possible." @@ -61,13 +61,18 @@ def diff_table_data(self, tablename): SELECT md5(array_agg(md5((t.*)::varchar))::varchar) FROM ( SELECT * - FROM {tablename} + FROM "{self.schema_name}"."{tablename}" ORDER BY {pk} limit :row_limit offset :row_offset ) AS t; """ - position = 0 + SQL_DIFFERENCE_BLOCK = f""" + SELECT {pk}, '[' || (t.*)::varchar || ']' + FROM "{self.schema_name}"."{tablename}" t + ORDER BY {pk} limit :row_limit offset :row_offset; + """ + position = 0 while position <= firstquery.count(): firstresult = self.firstsession.execute( SQL_TEMPLATE_HASH, @@ -78,19 +83,33 @@ def diff_table_data(self, tablename): {"row_limit": self.chunk_size, "row_offset": position}).fetchone() if firstresult != secondresult: + # OK - data is different - show the first rows which differ + firstdiff = self.firstsession.execute( + SQL_DIFFERENCE_BLOCK, + {"row_limit": self.chunk_size, + "row_offset": position}).fetchall() + seconddiff = self.secondsession.execute( + SQL_DIFFERENCE_BLOCK, + {"row_limit": self.chunk_size, + "row_offset": position}).fetchall() + index = position + for first, second in zip(firstdiff, seconddiff): + if first != second: + return False, f"data first differs at position: {index} - diff: [{first}] <> [{second}]" \ + f" - Comparison ends" + index += 1 return False, f"data is different - position {position} -" \ f" {position + self.chunk_size}" position += self.chunk_size return True, "data is identical." def get_all_sequences(self): - GET_SEQUENCES_SQL = """SELECT c.relname FROM - pg_class c WHERE c.relkind = 'S';""" + GET_SEQUENCES_SQL = f"""SELECT sequence_name FROM information_schema.sequences WHERE sequence_schema = '{self.schema_name}';""" return [x[0] for x in self.firstsession.execute(GET_SEQUENCES_SQL).fetchall()] def diff_sequence(self, seq_name): - GET_SEQUENCES_VALUE_SQL = f"SELECT last_value FROM {seq_name};" + GET_SEQUENCES_VALUE_SQL = f"SELECT last_value FROM \"{seq_name}\";" try: firstvalue = \ @@ -110,7 +129,7 @@ def diff_sequence(self, seq_name): if firstvalue > secondvalue: return False, f"first sequence is greater than" \ f" the second({firstvalue} vs {secondvalue})." - return True, f"sequences are identical- ({firstvalue})." + return True, f"sequences are identical - ({firstvalue})." def diff_all_sequences(self): print(bold(red('Starting sequence analysis.'))) @@ -140,7 +159,7 @@ def diff_all_table_data(self): with warnings.catch_warnings(): warnings.simplefilter("ignore", category=sa_exc.SAWarning) tables = sorted( - self.firstinspector.get_table_names(schema="public")) + self.firstinspector.get_table_names(schema=self.schema_name)) for table in tables: with Halo( text=f"Analysing table {table}. " diff --git a/setup.py b/setup.py index ec23877..3215e82 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ install_requires=[ 'SQLAlchemy<=1.3.11', 'halo<=0.0.28', - 'psycopg2<=2.8.4', + 'psycopg2-binary<=2.8.4', 'fabulous<=0.3.0', 'docopt<=0.6.2' ], From 3c2adfd80d2f92c90ada1a3327ab995683270ed7 Mon Sep 17 00:00:00 2001 From: Sean S Date: Wed, 3 Feb 2021 21:41:17 +0000 Subject: [PATCH 2/3] chore: Cleanup --- pgdatadiff/main.py | 8 +++++--- pgdatadiff/pgdatadiff.py | 12 +++++++----- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/pgdatadiff/main.py b/pgdatadiff/main.py index 06dfea1..860c27f 100644 --- a/pgdatadiff/main.py +++ b/pgdatadiff/main.py @@ -33,10 +33,12 @@ def main(): print(red("Only Postgres DBs are supported")) return 1 - schema_name='public' + schema_name=arguments['--schema'] - if '--schema' in arguments: - schema_name=arguments['--schema'] + if not schema_name: + schema_name='public' + + print(f"Checking database using schema [{schema_name}]...") differ = DBDiff(first_db_connection_string, second_db_connection_string, schema_name, chunk_size=arguments['--chunk-size'], diff --git a/pgdatadiff/pgdatadiff.py b/pgdatadiff/pgdatadiff.py index 0866d80..72a4908 100644 --- a/pgdatadiff/pgdatadiff.py +++ b/pgdatadiff/pgdatadiff.py @@ -49,7 +49,7 @@ def diff_table_data(self, tablename): return None, "tables are empty" if self.count_only is True: return True, "Counts are the same" - pk = ",".join(['"' + x + '"' for x in self.firstinspector.get_pk_constraint(tablename)['constrained_columns']]) + pk = ",".join([f'"{x}"' for x in self.firstinspector.get_pk_constraint(tablename)['constrained_columns']]) if not pk: return None, "no primary key(s) on this table." \ " Comparision is not possible." @@ -67,7 +67,7 @@ def diff_table_data(self, tablename): """ SQL_DIFFERENCE_BLOCK = f""" - SELECT {pk}, '[' || (t.*)::varchar || ']' + SELECT (t.*)::varchar FROM "{self.schema_name}"."{tablename}" t ORDER BY {pk} limit :row_limit offset :row_offset; """ @@ -94,9 +94,11 @@ def diff_table_data(self, tablename): "row_offset": position}).fetchall() index = position for first, second in zip(firstdiff, seconddiff): - if first != second: - return False, f"data first differs at position: {index} - diff: [{first}] <> [{second}]" \ - f" - Comparison ends" + first_row = first[0] + second_row = second[0] + if first_row != second_row: + return False, f"data first differs at position: {index}\n1st: {first_row}\n2nd: {second_row}" \ + f"\nComparison ends.\n" index += 1 return False, f"data is different - position {position} -" \ f" {position + self.chunk_size}" From 73fa34421e75b701be64a935a36b72a0c4483c11 Mon Sep 17 00:00:00 2001 From: Sean S Date: Wed, 3 Feb 2021 21:43:29 +0000 Subject: [PATCH 3/3] chore: Remove build file --- build.sh | 4 ---- 1 file changed, 4 deletions(-) delete mode 100755 build.sh diff --git a/build.sh b/build.sh deleted file mode 100755 index e4e0df3..0000000 --- a/build.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash -rm -rf dist -rm -rf build -python setup.py build && python setup.py install