From 225de1f1700a011e225fc42b830b6e82cd9792a2 Mon Sep 17 00:00:00 2001 From: AndreasG Date: Tue, 19 Jan 2021 14:02:35 +0200 Subject: [PATCH 1/7] Add additional flag to ignore tables in data comparison --- pgdatadiff/main.py | 6 ++++-- pgdatadiff/pgdatadiff.py | 8 ++++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/pgdatadiff/main.py b/pgdatadiff/main.py index db41464..eff46a2 100644 --- a/pgdatadiff/main.py +++ b/pgdatadiff/main.py @@ -1,6 +1,6 @@ """ Usage: - pgdatadiff --firstdb= --seconddb= [--only-data|--only-sequences] [--count-only] [--chunk-size=] + pgdatadiff --firstdb= --seconddb= [--only-data|--only-sequences] [--count-only] [--chunk-size=] [--exclude-tables=] pgdatadiff --version Options: @@ -10,6 +10,7 @@ --seconddb=postgres://postgres:password@localhost/seconddb The connection string of the second DB --only-data Only compare data, exclude sequences --only-sequences Only compare seqences, exclude data + --exclude-tables="" Exclude tables from data comparison Must be a comma separated string [default: empty string] --count-only Do a quick test based on counts alone --chunk-size=10000 The chunk size when comparing data [default: 10000] """ @@ -33,7 +34,8 @@ def main(): differ = DBDiff(first_db_connection_string, second_db_connection_string, chunk_size=arguments['--chunk-size'], - count_only=arguments['--count-only']) + count_only=arguments['--count-only'], + exclude_tables=arguments['--exclude-tables']) if not arguments['--only-sequences']: if differ.diff_all_table_data(): diff --git a/pgdatadiff/pgdatadiff.py b/pgdatadiff/pgdatadiff.py index 1bb9be1..cfaff37 100644 --- a/pgdatadiff/pgdatadiff.py +++ b/pgdatadiff/pgdatadiff.py @@ -1,6 +1,6 @@ import warnings -from fabulous.color import bold, green, red +from fabulous.color import bold, green, red, yellow from halo import Halo from sqlalchemy import exc as sa_exc from sqlalchemy.engine import create_engine @@ -19,7 +19,7 @@ def make_session(connection_string): class DBDiff(object): - def __init__(self, firstdb, seconddb, chunk_size=10000, count_only=False): + def __init__(self, firstdb, seconddb, chunk_size=10000, count_only=False, exclude_tables=""): firstsession, firstengine = make_session(firstdb) secondsession, secondengine = make_session(seconddb) self.firstsession = firstsession @@ -32,6 +32,7 @@ def __init__(self, firstdb, seconddb, chunk_size=10000, count_only=False): self.secondinspector = inspect(secondengine) self.chunk_size = int(chunk_size) self.count_only = count_only + self.exclude_tables = exclude_tables.split(',') def diff_table_data(self, tablename): try: @@ -142,6 +143,9 @@ def diff_all_table_data(self): tables = sorted( self.firstinspector.get_table_names(schema="public")) for table in tables: + if table in self.exclude_tables: + print(bold(yellow(f"Ignoring table {table}"))) + continue with Halo( text=f"Analysing table {table}. " f"[{tables.index(table) + 1}/{len(tables)}]", From 43fbf168ae17436fa5616510438e02b7b235f2d0 Mon Sep 17 00:00:00 2001 From: AndreasG Date: Tue, 9 Feb 2021 13:32:46 +0200 Subject: [PATCH 2/7] Add argument to specify schema name --- pgdatadiff/main.py | 6 ++++-- pgdatadiff/pgdatadiff.py | 9 +++++---- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/pgdatadiff/main.py b/pgdatadiff/main.py index eff46a2..c111f23 100644 --- a/pgdatadiff/main.py +++ b/pgdatadiff/main.py @@ -1,6 +1,6 @@ """ Usage: - pgdatadiff --firstdb= --seconddb= [--only-data|--only-sequences] [--count-only] [--chunk-size=] [--exclude-tables=] + pgdatadiff --firstdb= --seconddb= [--schema=] [--only-data|--only-sequences] [--count-only] [--chunk-size=] [--exclude-tables=] pgdatadiff --version Options: @@ -8,6 +8,7 @@ --version Show version. --firstdb=postgres://postgres:password@localhost/firstdb The connection string of the first DB --seconddb=postgres://postgres:password@localhost/seconddb The connection string of the second DB + --schema="public" The schema of tables in comparison --only-data Only compare data, exclude sequences --only-sequences Only compare seqences, exclude data --exclude-tables="" Exclude tables from data comparison Must be a comma separated string [default: empty string] @@ -35,7 +36,8 @@ def main(): differ = DBDiff(first_db_connection_string, second_db_connection_string, chunk_size=arguments['--chunk-size'], count_only=arguments['--count-only'], - exclude_tables=arguments['--exclude-tables']) + exclude_tables=arguments['--exclude-tables'], + schema=arguments['--schema']) if not arguments['--only-sequences']: if differ.diff_all_table_data(): diff --git a/pgdatadiff/pgdatadiff.py b/pgdatadiff/pgdatadiff.py index cfaff37..6f59f13 100644 --- a/pgdatadiff/pgdatadiff.py +++ b/pgdatadiff/pgdatadiff.py @@ -19,7 +19,7 @@ def make_session(connection_string): class DBDiff(object): - def __init__(self, firstdb, seconddb, chunk_size=10000, count_only=False, exclude_tables=""): + def __init__(self, firstdb, seconddb, schema, chunk_size=10000, count_only=False, exclude_tables=""): firstsession, firstengine = make_session(firstdb) secondsession, secondengine = make_session(seconddb) self.firstsession = firstsession @@ -33,6 +33,7 @@ def __init__(self, firstdb, seconddb, chunk_size=10000, count_only=False, exclud self.chunk_size = int(chunk_size) self.count_only = count_only self.exclude_tables = exclude_tables.split(',') + self.schema = schema or 'public' def diff_table_data(self, tablename): try: @@ -62,7 +63,7 @@ def diff_table_data(self, tablename): SELECT md5(array_agg(md5((t.*)::varchar))::varchar) FROM ( SELECT * - FROM {tablename} + FROM {self.schema}.{tablename} ORDER BY {pk} limit :row_limit offset :row_offset ) AS t; """ @@ -91,7 +92,7 @@ def get_all_sequences(self): self.firstsession.execute(GET_SEQUENCES_SQL).fetchall()] def diff_sequence(self, seq_name): - GET_SEQUENCES_VALUE_SQL = f"SELECT last_value FROM {seq_name};" + GET_SEQUENCES_VALUE_SQL = f"SELECT last_value FROM {self.schema}.{seq_name};" try: firstvalue = \ @@ -141,7 +142,7 @@ def diff_all_table_data(self): with warnings.catch_warnings(): warnings.simplefilter("ignore", category=sa_exc.SAWarning) tables = sorted( - self.firstinspector.get_table_names(schema="public")) + self.firstinspector.get_table_names(schema=self.schema)) for table in tables: if table in self.exclude_tables: print(bold(yellow(f"Ignoring table {table}"))) From aaf46343149b531f284e6ef8d7aad76c78955c3a Mon Sep 17 00:00:00 2001 From: Ishan Kanade Date: Mon, 24 May 2021 19:49:38 +0530 Subject: [PATCH 3/7] Raise error if input schema name does not exist in db --- pgdatadiff/pgdatadiff.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pgdatadiff/pgdatadiff.py b/pgdatadiff/pgdatadiff.py index 6f59f13..895b895 100644 --- a/pgdatadiff/pgdatadiff.py +++ b/pgdatadiff/pgdatadiff.py @@ -33,7 +33,11 @@ def __init__(self, firstdb, seconddb, schema, chunk_size=10000, count_only=False self.chunk_size = int(chunk_size) self.count_only = count_only self.exclude_tables = exclude_tables.split(',') + self.schema_names = self.firstinspector.get_schema_names() self.schema = schema or 'public' + if self.schema not in self.schema_names: + raise ValueError("Schema not found, check if argument has valid schema name") + print(f"Comparing for schema {self.schema}") def diff_table_data(self, tablename): try: @@ -143,6 +147,9 @@ def diff_all_table_data(self): warnings.simplefilter("ignore", category=sa_exc.SAWarning) tables = sorted( self.firstinspector.get_table_names(schema=self.schema)) + if len(tables) == 0: + print(bold(red(f'No tables found in schema: {self.schema}'))) + return 0 for table in tables: if table in self.exclude_tables: print(bold(yellow(f"Ignoring table {table}"))) From 773ae32c48c43e85955efa3ae09143202a80c1c6 Mon Sep 17 00:00:00 2001 From: Ishan Kanade Date: Mon, 24 May 2021 21:59:29 +0530 Subject: [PATCH 4/7] Compare sequences from same schema --- pgdatadiff/pgdatadiff.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/pgdatadiff/pgdatadiff.py b/pgdatadiff/pgdatadiff.py index 895b895..151cb6e 100644 --- a/pgdatadiff/pgdatadiff.py +++ b/pgdatadiff/pgdatadiff.py @@ -64,7 +64,7 @@ def diff_table_data(self, tablename): return False, "table is missing" SQL_TEMPLATE_HASH = f""" - SELECT md5(array_agg(md5((t.*)::varchar))::varchar) + SELECT md5(array_agg(md5((t.*)::text))::text) FROM ( SELECT * FROM {self.schema}.{tablename} @@ -84,14 +84,13 @@ def diff_table_data(self, tablename): {"row_limit": self.chunk_size, "row_offset": position}).fetchone() if firstresult != secondresult: - return False, f"data is different - position {position} -" \ + return False, f"data is different - for rows from {position} - to" \ f" {position + self.chunk_size}" position += self.chunk_size return True, "data is identical." def get_all_sequences(self): - GET_SEQUENCES_SQL = """SELECT c.relname FROM - pg_class c WHERE c.relkind = 'S';""" + GET_SEQUENCES_SQL = f"""SELECT sequence_name FROM information_schema.sequences WHERE sequence_schema = {self.schema};""" return [x[0] for x in self.firstsession.execute(GET_SEQUENCES_SQL).fetchall()] @@ -119,7 +118,7 @@ def diff_sequence(self, seq_name): return True, f"sequences are identical- ({firstvalue})." def diff_all_sequences(self): - print(bold(red('Starting sequence analysis.'))) + print(bold(red(f'Starting sequence analysis for schema -> {self.schema}'))) sequences = sorted(self.get_all_sequences()) failures = 0 for sequence in sequences: From 35ecb6b212b50d23f25198ef924872691a9294ff Mon Sep 17 00:00:00 2001 From: Ishan Kanade Date: Thu, 27 May 2021 21:17:53 +0530 Subject: [PATCH 5/7] Use MAX where primary key is a sequence, use COUNT otherwise. --- pgdatadiff/main.py | 4 +++- pgdatadiff/pgdatadiff.py | 37 ++++++++++++++++++++++++++++++++++--- 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/pgdatadiff/main.py b/pgdatadiff/main.py index c111f23..c5fe9f8 100644 --- a/pgdatadiff/main.py +++ b/pgdatadiff/main.py @@ -1,6 +1,6 @@ """ Usage: - pgdatadiff --firstdb= --seconddb= [--schema=] [--only-data|--only-sequences] [--count-only] [--chunk-size=] [--exclude-tables=] + pgdatadiff --firstdb= --seconddb= [--schema=] [--only-data|--only-sequences] [--count-only] [--count-with-max] [--chunk-size=] [--exclude-tables=] pgdatadiff --version Options: @@ -14,6 +14,7 @@ --exclude-tables="" Exclude tables from data comparison Must be a comma separated string [default: empty string] --count-only Do a quick test based on counts alone --chunk-size=10000 The chunk size when comparing data [default: 10000] + --count-with-max Use MAX(id) when a table uses a sequence, otherwise use COUNT. """ import pkg_resources @@ -36,6 +37,7 @@ def main(): differ = DBDiff(first_db_connection_string, second_db_connection_string, chunk_size=arguments['--chunk-size'], count_only=arguments['--count-only'], + count_with_max=arguments['--count-with-max'], exclude_tables=arguments['--exclude-tables'], schema=arguments['--schema']) diff --git a/pgdatadiff/pgdatadiff.py b/pgdatadiff/pgdatadiff.py index 151cb6e..36497c1 100644 --- a/pgdatadiff/pgdatadiff.py +++ b/pgdatadiff/pgdatadiff.py @@ -19,7 +19,7 @@ def make_session(connection_string): class DBDiff(object): - def __init__(self, firstdb, seconddb, schema, chunk_size=10000, count_only=False, exclude_tables=""): + def __init__(self, firstdb, seconddb, schema, chunk_size=10000, count_only=False, count_with_max=False, exclude_tables=""): firstsession, firstengine = make_session(firstdb) secondsession, secondengine = make_session(seconddb) self.firstsession = firstsession @@ -32,6 +32,7 @@ def __init__(self, firstdb, seconddb, schema, chunk_size=10000, count_only=False self.secondinspector = inspect(secondengine) self.chunk_size = int(chunk_size) self.count_only = count_only + self.count_with_max = count_with_max self.exclude_tables = exclude_tables.split(',') self.schema_names = self.firstinspector.get_schema_names() self.schema = schema or 'public' @@ -47,12 +48,25 @@ def diff_table_data(self, tablename): secondtable = Table(tablename, self.secondmeta, autoload=True) secondquery = self.secondsession.query( secondtable) + if self.count_with_max is True: + column = self.column_using_sequence(tablename) + pk_columns = self.firstinspector.get_pk_constraint(tablename)['constrained_columns'] + if column is not None and column in pk_columns: + GET_MAX_SQL = f"SELECT MAX({column}) FROM {tablename}" + first_max_count = self.firstsession.execute(GET_MAX_SQL).fetchone()[0] + second_max_count = self.secondsession.execute(GET_MAX_SQL).fetchone()[0] + if first_max_count != second_max_count: + return False, f"MAX value are different" \ + f" {first_max_count} != {second_max_count}" + if first_max_count == 0: + return None, "using MAX value, tables are empty because MAX on first db is zero" + return True, "MAX Value is same for both tables" if firstquery.count() != secondquery.count(): return False, f"counts are different" \ f" {firstquery.count()} != {secondquery.count()}" if firstquery.count() == 0: return None, "tables are empty" - if self.count_only is True: + if self.count_only is True or self.count_with_max is True: return True, "Counts are the same" pk = ",".join(self.firstinspector.get_pk_constraint(tablename)[ 'constrained_columns']) @@ -89,8 +103,25 @@ def diff_table_data(self, tablename): position += self.chunk_size return True, "data is identical." + def column_using_sequence(self, tablename): + GET_COLUMN_OF_TABLES_WITH_SEQUENCES = f"""SELECT + attrib.attname AS column_name + FROM pg_class AS seqclass + JOIN pg_depend AS dep + ON ( seqclass.relfilenode = dep.objid ) + JOIN pg_class AS depclass + ON ( dep.refobjid = depclass.relfilenode ) + JOIN pg_attribute AS attrib + ON ( attrib.attnum = dep.refobjsubid + AND attrib.attrelid = dep.refobjid ) + WHERE seqclass.relkind = 'S' AND depclass.relname = '{tablename}';""" + response = self.firstsession.execute(GET_COLUMN_OF_TABLES_WITH_SEQUENCES).fetchone() + if response is None: + return None + return response[0] + def get_all_sequences(self): - GET_SEQUENCES_SQL = f"""SELECT sequence_name FROM information_schema.sequences WHERE sequence_schema = {self.schema};""" + GET_SEQUENCES_SQL = f"""SELECT sequence_name FROM information_schema.sequences WHERE sequence_schema = '{self.schema}';""" return [x[0] for x in self.firstsession.execute(GET_SEQUENCES_SQL).fetchall()] From 786873f1885dd6bcc7b0819efe3881569cc98f8c Mon Sep 17 00:00:00 2001 From: Ishan Kanade Date: Mon, 31 May 2021 13:52:46 +0530 Subject: [PATCH 6/7] Fix counts being same but diff --- pgdatadiff/pgdatadiff.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pgdatadiff/pgdatadiff.py b/pgdatadiff/pgdatadiff.py index 36497c1..8f16db7 100644 --- a/pgdatadiff/pgdatadiff.py +++ b/pgdatadiff/pgdatadiff.py @@ -61,10 +61,12 @@ def diff_table_data(self, tablename): if first_max_count == 0: return None, "using MAX value, tables are empty because MAX on first db is zero" return True, "MAX Value is same for both tables" - if firstquery.count() != secondquery.count(): + first_count = firstquery.count() + second_count = secondquery.count() + if first_count != second_count: return False, f"counts are different" \ - f" {firstquery.count()} != {secondquery.count()}" - if firstquery.count() == 0: + f" {first_count} != {second_count}" + if first_count == 0: return None, "tables are empty" if self.count_only is True or self.count_with_max is True: return True, "Counts are the same" From 725ec207748bada6c8f21588364df5dd9d9b4d6a Mon Sep 17 00:00:00 2001 From: Daniel Quimper Date: Mon, 17 Apr 2023 15:59:08 -0400 Subject: [PATCH 7/7] Add include-tables option and progress status. --- README.md | 4 +++- pgdatadiff/main.py | 6 ++++-- pgdatadiff/pgdatadiff.py | 35 +++++++++++++++++++++++++---------- 3 files changed, 32 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index aaad602..0c15e84 100644 --- a/README.md +++ b/README.md @@ -32,6 +32,8 @@ Check `pgdatadiff --help` Docker images are available. -`docker run -it davidjmarkey/pgdatadiff:0.2.1 /usr/bin/pgdatadiff` +``` +docker run -it davidjmarkey/pgdatadiff:0.2.1 /usr/bin/pgdatadiff --help +``` diff --git a/pgdatadiff/main.py b/pgdatadiff/main.py index c5fe9f8..d2361ee 100644 --- a/pgdatadiff/main.py +++ b/pgdatadiff/main.py @@ -1,6 +1,6 @@ """ Usage: - pgdatadiff --firstdb= --seconddb= [--schema=] [--only-data|--only-sequences] [--count-only] [--count-with-max] [--chunk-size=] [--exclude-tables=] + pgdatadiff --firstdb= --seconddb= [--schema=] [--only-data|--only-sequences] [--count-only] [--count-with-max] [--chunk-size=] [--exclude-tables=] [--include-tables=] pgdatadiff --version Options: @@ -11,7 +11,8 @@ --schema="public" The schema of tables in comparison --only-data Only compare data, exclude sequences --only-sequences Only compare seqences, exclude data - --exclude-tables="" Exclude tables from data comparison Must be a comma separated string [default: empty string] + --exclude-tables="" Exclude tables from data comparison Must be a comma separated string + --include-tables="" Only include tables in data comparison Must be a comma separated string --count-only Do a quick test based on counts alone --chunk-size=10000 The chunk size when comparing data [default: 10000] --count-with-max Use MAX(id) when a table uses a sequence, otherwise use COUNT. @@ -39,6 +40,7 @@ def main(): count_only=arguments['--count-only'], count_with_max=arguments['--count-with-max'], exclude_tables=arguments['--exclude-tables'], + include_tables=arguments['--include-tables'], schema=arguments['--schema']) if not arguments['--only-sequences']: diff --git a/pgdatadiff/pgdatadiff.py b/pgdatadiff/pgdatadiff.py index 8f16db7..da6b177 100644 --- a/pgdatadiff/pgdatadiff.py +++ b/pgdatadiff/pgdatadiff.py @@ -19,7 +19,7 @@ def make_session(connection_string): class DBDiff(object): - def __init__(self, firstdb, seconddb, schema, chunk_size=10000, count_only=False, count_with_max=False, exclude_tables=""): + def __init__(self, firstdb, seconddb, schema, chunk_size=10000, count_only=False, count_with_max=False, progress=True, exclude_tables="", include_tables=""): firstsession, firstengine = make_session(firstdb) secondsession, secondengine = make_session(seconddb) self.firstsession = firstsession @@ -33,7 +33,15 @@ def __init__(self, firstdb, seconddb, schema, chunk_size=10000, count_only=False self.chunk_size = int(chunk_size) self.count_only = count_only self.count_with_max = count_with_max - self.exclude_tables = exclude_tables.split(',') + self.progress = progress + if exclude_tables is None: + self.exclude_tables = [] + else: + self.exclude_tables = exclude_tables.split(',') + if include_tables is None: + self.include_tables = [] + else: + self.include_tables = (include_tables or "").split(',') self.schema_names = self.firstinspector.get_schema_names() self.schema = schema or 'public' if self.schema not in self.schema_names: @@ -43,11 +51,9 @@ def __init__(self, firstdb, seconddb, schema, chunk_size=10000, count_only=False def diff_table_data(self, tablename): try: firsttable = Table(tablename, self.firstmeta, autoload=True) - firstquery = self.firstsession.query( - firsttable) + firstquery = self.firstsession.query(firsttable) secondtable = Table(tablename, self.secondmeta, autoload=True) - secondquery = self.secondsession.query( - secondtable) + secondquery = self.secondsession.query(secondtable) if self.count_with_max is True: column = self.column_using_sequence(tablename) pk_columns = self.firstinspector.get_pk_constraint(tablename)['constrained_columns'] @@ -90,7 +96,7 @@ def diff_table_data(self, tablename): position = 0 - while position <= firstquery.count(): + while position <= first_count: firstresult = self.firstsession.execute( SQL_TEMPLATE_HASH, {"row_limit": self.chunk_size, @@ -103,8 +109,15 @@ def diff_table_data(self, tablename): return False, f"data is different - for rows from {position} - to" \ f" {position + self.chunk_size}" position += self.chunk_size + self.display_progress(position, first_count) return True, "data is identical." + def display_progress(self, position, first_count): + if position > first_count: + position = first_count + if first_count > self.chunk_size and self.progress is True: + print(f' Progress: {"{:2.1f}".format(position/first_count*100)}%') + def column_using_sequence(self, tablename): GET_COLUMN_OF_TABLES_WITH_SEQUENCES = f"""SELECT attrib.attname AS column_name @@ -177,14 +190,16 @@ def diff_all_table_data(self): print(bold(red('Starting table analysis.'))) with warnings.catch_warnings(): warnings.simplefilter("ignore", category=sa_exc.SAWarning) - tables = sorted( - self.firstinspector.get_table_names(schema=self.schema)) + tables = sorted(self.firstinspector.get_table_names(schema=self.schema)) + if len(self.include_tables) > 0: + # Intersection of 2 array + tables = [value for value in tables if value in self.include_tables] if len(tables) == 0: print(bold(red(f'No tables found in schema: {self.schema}'))) return 0 for table in tables: if table in self.exclude_tables: - print(bold(yellow(f"Ignoring table {table}"))) + print(bold(yellow(f"Ignoring table {table} (excluded)"))) continue with Halo( text=f"Analysing table {table}. "