Add include-tables option and progress status. #1

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open

dquimper wants to merge 1 commit into ikanade:master from dquimper:dq/include-table-and-progress-status

README.md

-Original file line number
+Diff line change
@@ Expand Up / @@ -32,6 +32,8 @@ Check `pgdatadiff --help` @@
     Docker images are available.
-    `docker run -it davidjmarkey/pgdatadiff:0.2.1 /usr/bin/pgdatadiff`
+    ```
+    docker run -it davidjmarkey/pgdatadiff:0.2.1 /usr/bin/pgdatadiff --help
+    ```

pgdatadiff/main.py

-Original file line number
+Diff line change
@@ -1,6 +1,6 @@
     """
     Usage:
-      pgdatadiff --firstdb=<firstconnectionstring> --seconddb=<secondconnectionstring> [--schema=<schema>] [--only-data|--only-sequences] [--count-only] [--count-with-max] [--chunk-size=<size>] [--exclude-tables=<table1,table2>]
+      pgdatadiff --firstdb=<firstconnectionstring> --seconddb=<secondconnectionstring> [--schema=<schema>] [--only-data|--only-sequences] [--count-only] [--count-with-max] [--chunk-size=<size>] [--exclude-tables=<table1,table2>] [--include-tables=<table1,table2>]
       pgdatadiff --version
     Options:
@@ Expand All / @@ -11,7 +11,8 @@ @@
       --schema="public"         The schema of tables in comparison
       --only-data        Only compare data, exclude sequences
       --only-sequences   Only compare sequences, exclude data
-      --exclude-tables=""   Exclude tables from data comparison         Must be a comma separated string [default: empty string]
+      --exclude-tables=""   Exclude tables from data comparison         Must be a comma separated string
+      --include-tables=""   Only include tables in data comparison      Must be a comma separated string
       --count-only       Do a quick test based on counts alone
       --chunk-size=10000       The chunk size when comparing data [default: 10000]
       --count-with-max    Use MAX(id) when a table uses a sequence, otherwise use COUNT.
@@ Expand Down Expand Up / @@ -39,6 +40,7 @@ def main(): @@
                         count_only=arguments['--count-only'],
                         count_with_max=arguments['--count-with-max'],
                         exclude_tables=arguments['--exclude-tables'],
+                        include_tables=arguments['--include-tables'],
                         schema=arguments['--schema'])
         if not arguments['--only-sequences']:
@@ Expand Down @@

pgdatadiff/pgdatadiff.py

            
                      Original file line number
                      Diff line number
                      Diff line change
                  
    @@ -19,7 +19,7 @@ def make_session(connection_string):
  
    class DBDiff(object):

        def __init__(self, firstdb, seconddb, schema, chunk_size=10000, count_only=False, count_with_max=False, exclude_tables=""):

        def __init__(self, firstdb, seconddb, schema, chunk_size=10000, count_only=False, count_with_max=False, progress=True, exclude_tables="", include_tables=""):

            firstsession, firstengine = make_session(firstdb)

            secondsession, secondengine = make_session(seconddb)

            self.firstsession = firstsession

    @@ -33,7 +33,15 @@ def __init__(self, firstdb, seconddb, schema, chunk_size=10000, count_only=False
  
            self.chunk_size = int(chunk_size)

            self.count_only = count_only

            self.count_with_max = count_with_max

            self.exclude_tables = exclude_tables.split(',')

            self.progress = progress

            if exclude_tables is None:

                self.exclude_tables = []

            else:

                self.exclude_tables = exclude_tables.split(',')

            if include_tables is None:

                self.include_tables = []

            else:

                self.include_tables = (include_tables or "").split(',')

            self.schema_names = self.firstinspector.get_schema_names()

            self.schema = schema or 'public'

            if self.schema not in self.schema_names:

    @@ -43,11 +51,9 @@ def __init__(self, firstdb, seconddb, schema, chunk_size=10000, count_only=False
  
        def diff_table_data(self, tablename):

            try:

                firsttable = Table(tablename, self.firstmeta, autoload=True)

                firstquery = self.firstsession.query(

                    firsttable)

                firstquery = self.firstsession.query(firsttable)

                secondtable = Table(tablename, self.secondmeta, autoload=True)

                secondquery = self.secondsession.query(

                    secondtable)

                secondquery = self.secondsession.query(secondtable)

                if self.count_with_max is True:

                    column = self.column_using_sequence(tablename)

                    pk_columns = self.firstinspector.get_pk_constraint(tablename)['constrained_columns']

    @@ -90,7 +96,7 @@ def diff_table_data(self, tablename):
  
            position = 0

            while position <= firstquery.count():

            while position <= first_count:

                firstresult = self.firstsession.execute(

                    SQL_TEMPLATE_HASH,

                    {"row_limit": self.chunk_size,

    @@ -103,8 +109,15 @@ def diff_table_data(self, tablename):
  
                    return False, f"data is different - for rows from {position} - to" \

                                  f" {position + self.chunk_size}"

                position += self.chunk_size

                self.display_progress(position, first_count)

            return True, "data is identical."

        def display_progress(self, position, first_count):
            """Print the comparison progress for the current table as a percentage.

            :param position: offset reached so far; values larger than
                ``first_count`` are clamped so the output never exceeds 100%.
            :param first_count: total used as the denominator of the percentage.

            Nothing is printed when ``first_count`` does not exceed
            ``self.chunk_size`` or when ``self.progress`` is not ``True``.
            """
            # Guard clause: bail out early when there is nothing to report.
            if first_count <= self.chunk_size or self.progress is not True:
                return

            reached = min(position, first_count)
            percent = reached / first_count * 100
            print(f' Progress: {percent:2.1f}%')

        def column_using_sequence(self, tablename):

            GET_COLUMN_OF_TABLES_WITH_SEQUENCES =  f"""SELECT

                   attrib.attname AS column_name

    @@ -177,14 +190,16 @@ def diff_all_table_data(self):
  
            print(bold(red('Starting table analysis.')))

            with warnings.catch_warnings():

                warnings.simplefilter("ignore", category=sa_exc.SAWarning)

                tables = sorted(

                    self.firstinspector.get_table_names(schema=self.schema))

                tables = sorted(self.firstinspector.get_table_names(schema=self.schema))

                if len(self.include_tables) > 0:

                    # Keep only the tables that also appear in include_tables (list intersection)

                    tables = [value for value in tables if value in self.include_tables]

                if len(tables) == 0:

                    print(bold(red(f'No tables found in schema: {self.schema}')))

                    return 0

                for table in tables:

                    if table in self.exclude_tables:

                        print(bold(yellow(f"Ignoring table {table}")))

                        print(bold(yellow(f"Ignoring table {table} (excluded)")))

                        continue

                    with Halo(

                            text=f"Analysing table {table}. "

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Add include-tables option and progress status. #1

Uh oh!

Diff view

Diff view

There are no files selected for viewing

Add include-tables option and progress status. #1

Are you sure you want to change the base?

Uh oh!

Add include-tables option and progress status. #1

Uh oh!

Uh oh!

Diff view

Diff view

There are no files selected for viewing