diff --git a/README.md b/README.md index aaad602..0c15e84 100644 --- a/README.md +++ b/README.md @@ -32,6 +32,8 @@ Check `pgdatadiff --help` Docker images are available. -`docker run -it davidjmarkey/pgdatadiff:0.2.1 /usr/bin/pgdatadiff` +``` +docker run -it davidjmarkey/pgdatadiff:0.2.1 /usr/bin/pgdatadiff --help +``` diff --git a/pgdatadiff/main.py b/pgdatadiff/main.py index c5fe9f8..d2361ee 100644 --- a/pgdatadiff/main.py +++ b/pgdatadiff/main.py @@ -1,6 +1,6 @@ """ Usage: - pgdatadiff --firstdb= --seconddb= [--schema=] [--only-data|--only-sequences] [--count-only] [--count-with-max] [--chunk-size=] [--exclude-tables=] + pgdatadiff --firstdb= --seconddb= [--schema=] [--only-data|--only-sequences] [--count-only] [--count-with-max] [--chunk-size=] [--exclude-tables=] [--include-tables=] pgdatadiff --version Options: @@ -11,7 +11,8 @@ --schema="public" The schema of tables in comparison --only-data Only compare data, exclude sequences --only-sequences Only compare seqences, exclude data - --exclude-tables="" Exclude tables from data comparison Must be a comma separated string [default: empty string] + --exclude-tables="" Exclude tables from data comparison Must be a comma separated string + --include-tables="" Only include tables in data comparison Must be a comma separated string --count-only Do a quick test based on counts alone --chunk-size=10000 The chunk size when comparing data [default: 10000] --count-with-max Use MAX(id) when a table uses a sequence, otherwise use COUNT. @@ -39,6 +40,7 @@ def main(): count_only=arguments['--count-only'], count_with_max=arguments['--count-with-max'], exclude_tables=arguments['--exclude-tables'], + include_tables=arguments['--include-tables'], schema=arguments['--schema']) if not arguments['--only-sequences']: diff --git a/pgdatadiff/pgdatadiff.py b/pgdatadiff/pgdatadiff.py index 8f16db7..da6b177 100644 --- a/pgdatadiff/pgdatadiff.py +++ b/pgdatadiff/pgdatadiff.py @@ -19,7 +19,7 @@ def make_session(connection_string): class DBDiff(object): - def __init__(self, firstdb, seconddb, schema, chunk_size=10000, count_only=False, count_with_max=False, exclude_tables=""): + def __init__(self, firstdb, seconddb, schema, chunk_size=10000, count_only=False, count_with_max=False, progress=True, exclude_tables="", include_tables=""): firstsession, firstengine = make_session(firstdb) secondsession, secondengine = make_session(seconddb) self.firstsession = firstsession @@ -33,7 +33,15 @@ def __init__(self, firstdb, seconddb, schema, chunk_size=10000, count_only=False self.chunk_size = int(chunk_size) self.count_only = count_only self.count_with_max = count_with_max - self.exclude_tables = exclude_tables.split(',') + self.progress = progress + if exclude_tables is None: + self.exclude_tables = [] + else: + self.exclude_tables = exclude_tables.split(',') + if include_tables is None: + self.include_tables = [] + else: + self.include_tables = (include_tables or "").split(',') self.schema_names = self.firstinspector.get_schema_names() self.schema = schema or 'public' if self.schema not in self.schema_names: @@ -43,11 +51,9 @@ def __init__(self, firstdb, seconddb, schema, chunk_size=10000, count_only=False def diff_table_data(self, tablename): try: firsttable = Table(tablename, self.firstmeta, autoload=True) - firstquery = self.firstsession.query( - firsttable) + firstquery = self.firstsession.query(firsttable) secondtable = Table(tablename, self.secondmeta, autoload=True) - secondquery = self.secondsession.query( - secondtable) + secondquery = self.secondsession.query(secondtable) if self.count_with_max is True: column = self.column_using_sequence(tablename) pk_columns = self.firstinspector.get_pk_constraint(tablename)['constrained_columns'] @@ -90,7 +96,7 @@ def diff_table_data(self, tablename): position = 0 - while position <= firstquery.count(): + while position <= first_count: firstresult = self.firstsession.execute( SQL_TEMPLATE_HASH, {"row_limit": self.chunk_size, @@ -103,8 +109,15 @@ def diff_table_data(self, tablename): return False, f"data is different - for rows from {position} - to" \ f" {position + self.chunk_size}" position += self.chunk_size + self.display_progress(position, first_count) return True, "data is identical." + def display_progress(self, position, first_count): + if position > first_count: + position = first_count + if first_count > self.chunk_size and self.progress is True: + print(f' Progress: {"{:2.1f}".format(position/first_count*100)}%') + def column_using_sequence(self, tablename): GET_COLUMN_OF_TABLES_WITH_SEQUENCES = f"""SELECT attrib.attname AS column_name @@ -177,14 +190,16 @@ def diff_all_table_data(self): print(bold(red('Starting table analysis.'))) with warnings.catch_warnings(): warnings.simplefilter("ignore", category=sa_exc.SAWarning) - tables = sorted( - self.firstinspector.get_table_names(schema=self.schema)) + tables = sorted(self.firstinspector.get_table_names(schema=self.schema)) + if len(self.include_tables) > 0: + # Intersection of 2 array + tables = [value for value in tables if value in self.include_tables] if len(tables) == 0: print(bold(red(f'No tables found in schema: {self.schema}'))) return 0 for table in tables: if table in self.exclude_tables: - print(bold(yellow(f"Ignoring table {table}"))) + print(bold(yellow(f"Ignoring table {table} (excluded)"))) continue with Halo( text=f"Analysing table {table}. "