From f024cc6a5aeaef7e40294500364874c2d0a39961 Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Wed, 12 Mar 2025 14:29:52 -0500 Subject: [PATCH 01/21] Exclude defaults when dumping schema to dict --- python/felis/diff.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/felis/diff.py b/python/felis/diff.py index 348d751f..aab4d159 100644 --- a/python/felis/diff.py +++ b/python/felis/diff.py @@ -56,8 +56,8 @@ class SchemaDiff: """ def __init__(self, schema1: Schema, schema2: Schema): - self.dict1 = schema1.model_dump(exclude_none=True) - self.dict2 = schema2.model_dump(exclude_none=True) + self.dict1 = schema1.model_dump(exclude_none=True, exclude_defaults=True) + self.dict2 = schema2.model_dump(exclude_none=True, exclude_defaults=True) self.diff = DeepDiff(self.dict1, self.dict2, ignore_order=True) def print(self) -> None: From 3b072cefca8f9f65f5ddfff20816ab643d262e9d Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Wed, 12 Mar 2025 16:03:21 -0500 Subject: [PATCH 02/21] Fix indentation --- tests/data/test_diff1.yaml | 64 +++++++++++++++++++------------------- tests/data/test_diff2.yaml | 62 ++++++++++++++++++++---------------- 2 files changed, 67 insertions(+), 59 deletions(-) diff --git a/tests/data/test_diff1.yaml b/tests/data/test_diff1.yaml index 3a8393ac..fdccef73 100644 --- a/tests/data/test_diff1.yaml +++ b/tests/data/test_diff1.yaml @@ -4,36 +4,36 @@ description: Test diff "@id": "#test_diff" version: "1.2.3" tables: - - name: test_table1 - "@id": "#test_table1" - description: Test table 1 +- name: test_table1 + "@id": "#test_table1" + description: Test table 1 + columns: + - name: column1 + "@id": "#test_table1.column1" + datatype: int + description: Column 1 + - name: column2 + "@id": "#test_table1.column2" + datatype: string + description: Column 2 + length: 30 + nullable: false + - name: column3 + "@id": "#test_table1.column3" + datatype: string + description: Column 3 + length: 100 + indexes: + - name: idx_column1 + "@id": "#test_table1_idx_column1" + description: Index on column 1 columns: - - name: column1 - "@id": "#test_table1.column1" - datatype: int - description: Column 1 - - name: column2 - "@id": "#test_table1.column2" - datatype: string - description: Column 2 - length: 30 - nullable: false - - name: column3 - "@id": "#test_table1.column3" - datatype: string - description: Column 3 - length: 100 - indexes: - - name: idx_column1 - "@id": "#test_table1_idx_column1" - description: Index on column 1 - columns: - - "#test_table1.column1" - constraints: - - name: uniq_column2 - "@id": "#test_table1_uniq_column2" - "@type": "Unique" - description: Unique column 2 - columns: - - "#test_table1.column2" - primaryKey: "#test_table1.column1" + - "#test_table1.column1" + constraints: + - name: uniq_column2 + "@id": "#test_table1_uniq_column2" + "@type": "Unique" + description: Unique column 2 + columns: + - "#test_table1.column2" + primaryKey: "#test_table1.column1" diff --git a/tests/data/test_diff2.yaml b/tests/data/test_diff2.yaml index e667a1da..394839bd 100644 --- a/tests/data/test_diff2.yaml +++ b/tests/data/test_diff2.yaml @@ -4,31 +4,39 @@ description: Another test diff "@id": "#test_diff" version: "4.5.6" tables: - - name: test_table1 - "@id": "#test_table1" - description: Test table 1 +- name: test_table1 + "@id": "#test_table1" + description: Test table 1 + columns: + - name: column1 + "@id": "#test_table1.column1" + datatype: int + description: Column 1 + - name: column2 + "@id": "#test_table1.column2" + datatype: string + description: Column 2 + length: 30 + nullable: false + indexes: + - name: idx_column2 + "@id": "#test_table1_idx_column2" + description: Index on column 2 columns: - - name: column1 - "@id": "#test_table1.column1" - datatype: int - description: Column 1 - - name: column2 - "@id": "#test_table1.column2" - datatype: string - description: Column 2 - length: 30 - nullable: false - indexes: - - name: idx_column2 - "@id": "#test_table1_idx_column2" - description: Index on column 2 - columns: - - "#test_table1.column2" - constraints: - - name: uniq_column2 - "@id": "#test_table1_uniq_column2" - "@type": "Unique" - description: Unique column 2 - columns: - - "#test_table1.column2" - primaryKey: "#test_table1.column1" + - "#test_table1.column2" + constraints: + - name: uniq_column2 + "@id": "#test_table1_uniq_column2" + "@type": "Unique" + description: Unique column 2 + columns: + - "#test_table1.column2" + primaryKey: "#test_table1.column1" +- name: test_table2 + "@id": "#test_table2" + description: Test table 2 + columns: + - name: column1 + "@id": "#test_table2.column1" + datatype: int + description: Column 1 From 76f394e4c904fbf10069d3aa26ec3dee1a6069fe Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Wed, 12 Mar 2025 17:37:17 -0500 Subject: [PATCH 03/21] Add option for specifying a list of tables to filter on when comparing --- python/felis/cli.py | 6 ++++++ python/felis/diff.py | 43 +++++++++++++++++++++++++++++++++------- tests/test_cli.py | 47 ++++++++++++++++++++++++++++++++++++++++++++ tests/test_diff.py | 36 ++++++++++++++++++++++++++++++++- 4 files changed, 124 insertions(+), 8 deletions(-) diff --git a/python/felis/cli.py b/python/felis/cli.py index 001786f9..03f9494a 100644 --- a/python/felis/cli.py +++ b/python/felis/cli.py @@ -431,6 +431,7 @@ def validate( default="deepdiff", ) @click.option("-E", "--error-on-change", is_flag=True, help="Exit with error code if schemas are different") +@click.option("--table", "tables", multiple=True, help="Table names to filter on.") @click.argument("files", nargs=-1, type=click.File()) @click.pass_context def diff( @@ -438,6 +439,7 @@ def diff( engine_url: str | None, comparator: str, error_on_change: bool, + tables: list[str], files: Iterable[IO[str]], ) -> None: schemas = [ @@ -447,12 +449,16 @@ def diff( diff: SchemaDiff if len(schemas) == 2 and engine_url is None: if comparator == "alembic": + if tables: + raise click.ClickException("Table filtering is not supported for Alembic comparator") db_context = create_database(schemas[0]) assert isinstance(db_context.engine, Engine) diff = DatabaseDiff(schemas[1], db_context.engine) else: diff = FormattedSchemaDiff(schemas[0], schemas[1]) elif len(schemas) == 1 and engine_url is not None: + if tables: + raise click.ClickException("Table filtering is not supported for database comparison") engine = create_engine(engine_url) diff = DatabaseDiff(schemas[0], engine) else: diff --git a/python/felis/diff.py b/python/felis/diff.py index aab4d159..6131b15c 100644 --- a/python/felis/diff.py +++ b/python/felis/diff.py @@ -21,6 +21,7 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . +import copy import logging import pprint import re @@ -53,12 +54,38 @@ class SchemaDiff: The first schema to compare. schema2 The second schema to compare. + table_filter + A list of table names to filter on. """ - def __init__(self, schema1: Schema, schema2: Schema): - self.dict1 = schema1.model_dump(exclude_none=True, exclude_defaults=True) - self.dict2 = schema2.model_dump(exclude_none=True, exclude_defaults=True) - self.diff = DeepDiff(self.dict1, self.dict2, ignore_order=True) + def __init__(self, schema1: Schema, schema2: Schema, table_filter: list[str] = []): + self.schema1 = copy.deepcopy(schema1) + self.schema2 = copy.deepcopy(schema2) + if table_filter: + logger.debug(f"Filtering on tables: {table_filter}") + self.table_filter = table_filter + self._create_diff() + + def _create_diff(self) -> dict[str, Any]: + if self.table_filter: + self.schema1.tables = [table for table in self.schema1.tables if table.name in self.table_filter] + self.schema2.tables = [table for table in self.schema2.tables if table.name in self.table_filter] + self.dict1 = self.schema1.model_dump(exclude_none=True, exclude_defaults=True) + self.dict2 = self.schema2.model_dump(exclude_none=True, exclude_defaults=True) + self._diff = DeepDiff(self.dict1, self.dict2, ignore_order=True) + return self._diff + + @property + def diff(self) -> dict[str, Any]: + """ + Return the differences between the two schemas. + + Returns + ------- + dict + The differences between the two schemas. + """ + return self._diff def print(self) -> None: """ @@ -90,10 +117,12 @@ class FormattedSchemaDiff(SchemaDiff): The first schema to compare. schema2 The second schema to compare. + table_filter + A list of table names to filter on. """ - def __init__(self, schema1: Schema, schema2: Schema): - super().__init__(schema1, schema2) + def __init__(self, schema1: Schema, schema2: Schema, table_filter: list[str] = []): + super().__init__(schema1, schema2, table_filter) def print(self) -> None: """ @@ -224,7 +253,7 @@ def __init__(self, schema: Schema, engine: Engine): connection, opts={"compare_type": True, "target_metadata": db_metadata} ) schema_metadata = MetaDataBuilder(schema, apply_schema_to_metadata=False).build() - self.diff = compare_metadata(mc, schema_metadata) + self._diff = compare_metadata(mc, schema_metadata) def print(self) -> None: """ diff --git a/tests/test_cli.py b/tests/test_cli.py index 9ec93119..82a7360b 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -150,12 +150,47 @@ def test_diff_database(self) -> None: run_cli(["diff", f"--engine-url={self.sqlite_url}", test_diff2]) + def test_diff_database_with_table_filter(self) -> None: + """Test for ``diff`` command with database and table filter. This + should fail as table filters are not supported for this comparator. + """ + test_diff1 = os.path.join(TESTDIR, "data", "test_diff1.yaml") + test_diff2 = os.path.join(TESTDIR, "data", "test_diff2.yaml") + db_url = f"sqlite:///{self.tmpdir}/tap_schema.sqlite3" + + engine = create_engine(db_url) + metadata_db = MetaDataBuilder(Schema.from_uri(test_diff1), apply_schema_to_metadata=False).build() + metadata_db.create_all(engine) + + runner = CliRunner() + result = runner.invoke( + cli, ["diff", f"--engine-url={db_url}", "--table", "table1", test_diff2], catch_exceptions=False + ) + self.assertNotEqual(result.exit_code, 0) + def test_diff_alembic(self) -> None: """Test for ``diff`` command with ``--alembic`` comparator option.""" test_diff1 = os.path.join(TEST_DIR, "data", "test_diff1.yaml") test_diff2 = os.path.join(TEST_DIR, "data", "test_diff2.yaml") run_cli(["diff", "--comparator", "alembic", test_diff1, test_diff2], print_output=True) + def test_diff_alembic_with_table_filter(self) -> None: + """Test for ``diff`` command with ``--alembic`` option and a table + filter. This should fail as table filters are not supported for this + comparator. + """ + test_diff1 = os.path.join(TESTDIR, "data", "test_diff1.yaml") + test_diff2 = os.path.join(TESTDIR, "data", "test_diff2.yaml") + + runner = CliRunner() + result = runner.invoke( + cli, + ["diff", "--comparator", "alembic", "--table", "table1", test_diff1, test_diff2], + catch_exceptions=False, + ) + print(result.output) + self.assertNotEqual(result.exit_code, 0) + def test_diff_error(self) -> None: """Test for ``diff`` command with bad arguments.""" test_diff1 = os.path.join(TEST_DIR, "data", "test_diff1.yaml") @@ -167,6 +202,18 @@ def test_diff_error_on_change(self) -> None: test_diff2 = os.path.join(TEST_DIR, "data", "test_diff2.yaml") run_cli(["diff", "--error-on-change", test_diff1, test_diff2], expect_error=True, print_output=True) + def test_diff_with_table_filter(self) -> None: + """Test for ``diff`` command and a table filter.""" + test_diff1 = os.path.join(TESTDIR, "data", "test_diff1.yaml") + test_diff2 = os.path.join(TESTDIR, "data", "test_diff2.yaml") + + runner = CliRunner() + result = runner.invoke( + cli, ["diff", "--table", "table1", test_diff1, test_diff2], catch_exceptions=False + ) + print(result.output) + self.assertEqual(result.exit_code, 0) + def test_dump_yaml(self) -> None: """Test for ``dump`` command with YAML output.""" with tempfile.NamedTemporaryFile(delete=True, suffix=".yaml") as temp_file: diff --git a/tests/test_diff.py b/tests/test_diff.py index fc876aef..fb09b31d 100644 --- a/tests/test_diff.py +++ b/tests/test_diff.py @@ -20,9 +20,11 @@ # along with this program. If not, see . import unittest +from typing import Any from sqlalchemy import create_engine +from felis import Schema from felis import datamodel as dm from felis.diff import DatabaseDiff, FormattedSchemaDiff, SchemaDiff from felis.metadata import MetaDataBuilder @@ -31,7 +33,7 @@ class TestSchemaDiff(unittest.TestCase): """Test the SchemaDiff class.""" - def _diff(self, schema1, schema2): + def _diff(self, schema1: Schema, schema2: Schema) -> dict[str, Any]: return SchemaDiff(schema1, schema2).diff def test_schema_diff(self) -> None: @@ -243,6 +245,38 @@ def test_get_id_error(self) -> None: with self.assertRaises(ValueError): FormattedSchemaDiff._get_id(id_dict, keys) + def test_table_filter(self) -> None: + schema1 = dm.Schema( + name="schema1", + id="#schema1", + description="Schema 1", + tables=[ + dm.Table(name="table1", id="#table1", description="Table 1", columns=[]), + dm.Table(name="table2", id="#table2", description="Table 2", columns=[]), + dm.Table(name="table3", id="#table3", description="Table 3", columns=[]), + ], + ) + + schema2 = dm.Schema( + name="schema1", + id="#schema1", + description="Schema 1", + tables=[ + dm.Table(name="table1", id="#table1", description="Table 1", columns=[]), + dm.Table(name="table2", id="#table2", description="Table 2", columns=[]), + ], + ) + + diff = SchemaDiff(schema1, schema2, table_filter=["table1"]).diff + self.assertEqual(len(diff), 0) + + diff = SchemaDiff(schema1, schema2, table_filter=["table2"]).diff + self.assertEqual(len(diff), 0) + + diff = SchemaDiff(schema1, schema2, table_filter=["table3"]).diff + self.assertEqual(len(diff), 1) + self.assertTrue("iterable_item_removed" in diff) + class TestDatabaseDiff(unittest.TestCase): """Test the DatabaseDiff class.""" From 552be3840beac5bf3adec01f5b0701748ad47584 Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Tue, 1 Jul 2025 16:59:51 -0500 Subject: [PATCH 04/21] Fix displaying the ID of the changed object --- python/felis/diff.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/python/felis/diff.py b/python/felis/diff.py index 6131b15c..4ca154f3 100644 --- a/python/felis/diff.py +++ b/python/felis/diff.py @@ -141,10 +141,8 @@ def print(self) -> None: handler(self.diff[change_type]) def _print_header(self, id_dict: dict[str, Any], keys: list[int | str]) -> None: - # id = self._get_id(id_dict, keys) - # Don't display ID here for now; it is always just the schema ID. - print(f"{self._get_key_display(keys)}") - # print(f"{id} @ {self._get_key_display(keys)}") + id = self._get_id(id_dict, keys) + print(f"'{id}'@{self._get_key_display(keys)}") def _handle_values_changed(self, changes: dict[str, Any]) -> None: for key in changes: @@ -187,24 +185,27 @@ def _handle_dictionary_item_removed(self, changes: dict[str, Any]) -> None: @staticmethod def _get_id(values: dict, keys: list[str | int]) -> str: - # Unused for now, pending updates to diff tool in DM-49446. value: list | dict = values last_id = None for key in keys: logger.debug(f"Processing key <{key}> with type {type(key)}") logger.debug(f"Type of value: {type(value)}") + + # Store the ID if current value is a dict with an 'id' field if isinstance(value, dict) and "id" in value: last_id = value["id"] + + # Navigate to the next level + if isinstance(value, dict) and key in value: + value = value[key] elif isinstance(value, list) and isinstance(key, int): if 0 <= key < len(value): value = value[key] else: raise ValueError(f"Index '{key}' is out of range for list of length {len(value)}") - value = value[key] - - if isinstance(value, dict) and "id" in value: - last_id = value["id"] + else: + raise ValueError(f"Key '{key}' not found in value of type {type(value)}") if last_id is not None: return last_id From 278e44befc6a098b69410a1f27e705474fb20251 Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Tue, 1 Jul 2025 17:57:55 -0500 Subject: [PATCH 05/21] Rewrite the `diff` module module so that it emits JSON ouput --- python/felis/diff.py | 289 +++++++++++++++++++++++++++++++++---------- 1 file changed, 224 insertions(+), 65 deletions(-) diff --git a/python/felis/diff.py b/python/felis/diff.py index 4ca154f3..766cdd4d 100644 --- a/python/felis/diff.py +++ b/python/felis/diff.py @@ -22,10 +22,9 @@ # along with this program. If not, see . import copy +import json import logging -import pprint import re -from collections.abc import Callable from typing import Any from alembic.autogenerate import compare_metadata @@ -36,7 +35,7 @@ from .datamodel import Schema from .metadata import MetaDataBuilder -__all__ = ["DatabaseDiff", "SchemaDiff"] +__all__ = ["DatabaseDiff", "FormattedSchemaDiff", "SchemaDiff"] logger = logging.getLogger(__name__) @@ -50,17 +49,24 @@ class SchemaDiff: Parameters ---------- - schema1 - The first schema to compare. - schema2 - The second schema to compare. + schema_old + The old schema to compare, typically the original schema. + schema_new + The new schema to compare, typically the modified schema. table_filter A list of table names to filter on. + + Notes + ----- + This class uses DeepDiff to compare two schemas and provides methods to + retrieve the differences. It is designed to be extended for more structured + output, such as in `FormattedSchemaDiff` and would not typically be used + directly. """ - def __init__(self, schema1: Schema, schema2: Schema, table_filter: list[str] = []): - self.schema1 = copy.deepcopy(schema1) - self.schema2 = copy.deepcopy(schema2) + def __init__(self, schema_old: Schema, schema_new: Schema, table_filter: list[str] = []): + self.schema_old = copy.deepcopy(schema_old) + self.schema_new = copy.deepcopy(schema_new) if table_filter: logger.debug(f"Filtering on tables: {table_filter}") self.table_filter = table_filter @@ -68,11 +74,15 @@ def __init__(self, schema1: Schema, schema2: Schema, table_filter: list[str] = [ def _create_diff(self) -> dict[str, Any]: if self.table_filter: - self.schema1.tables = [table for table in self.schema1.tables if table.name in self.table_filter] - self.schema2.tables = [table for table in self.schema2.tables if table.name in self.table_filter] - self.dict1 = self.schema1.model_dump(exclude_none=True, exclude_defaults=True) - self.dict2 = self.schema2.model_dump(exclude_none=True, exclude_defaults=True) - self._diff = DeepDiff(self.dict1, self.dict2, ignore_order=True) + self.schema_old.tables = [ + table for table in self.schema_old.tables if table.name in self.table_filter + ] + self.schema_new.tables = [ + table for table in self.schema_new.tables if table.name in self.table_filter + ] + self.dict_old = self.schema_old.model_dump(exclude_none=True, exclude_defaults=True) + self.dict_new = self.schema_new.model_dump(exclude_none=True, exclude_defaults=True) + self._diff = DeepDiff(self.dict_old, self.dict_new, ignore_order=True) return self._diff @property @@ -87,11 +97,22 @@ def diff(self) -> dict[str, Any]: """ return self._diff + def to_change_list(self) -> list[dict[str, Any]]: + """ + Convert differences to a structured format. + + Returns + ------- + list[dict[str, Any]] + List of change dictionaries. + """ + return [] # Base implementation returns empty list + def print(self) -> None: """ - Print the differences between the two schemas. + Print the differences between the two schemas in raw format. """ - pprint.pprint(self.diff) + print(self.diff) @property def has_changes(self) -> bool: @@ -108,83 +129,197 @@ def has_changes(self) -> bool: class FormattedSchemaDiff(SchemaDiff): """ - Compare two schemas using DeepDiff and print the differences using a - customized output format. + Compare two schemas using DeepDiff and emit structured JSON differences. Parameters ---------- - schema1 - The first schema to compare. - schema2 - The second schema to compare. + schema_old + The old schema to compare, typically the original schema. + schema_new + The new schema to compare, typically the modified schema. table_filter A list of table names to filter on. + + Notes + ----- + This class extends `SchemaDiff` to provide a more structured output of + differences. It formats the differences into a list of dictionaries, each + representing a change with details such as change type, path, and values + involved. + + Output dictionaries representing the changes are formatted as follows:: + + { + "change_type": str, + "id": str, + "path": str, + "old_value": Any (for value changes), + "new_value": Any (for value changes), + "value": Any (for additions/removals) + } + + The changes can be printed to JSON using the `print` method. """ - def __init__(self, schema1: Schema, schema2: Schema, table_filter: list[str] = []): - super().__init__(schema1, schema2, table_filter) + def __init__(self, schema_old: Schema, schema_new: Schema, table_filter: list[str] = []): + super().__init__(schema_old, schema_new, table_filter) - def print(self) -> None: + def to_change_list(self) -> list[dict[str, Any]]: """ - Print the differences between the two schemas using a custom format. + Convert differences to a structured format. + + Returns + ------- + list[dict[str, Any]] + List of changes in their dictionary representation. """ - handlers: dict[str, Callable[[dict[str, Any]], None]] = { - "values_changed": self._handle_values_changed, - "iterable_item_added": self._handle_iterable_item_added, - "iterable_item_removed": self._handle_iterable_item_removed, - "dictionary_item_added": self._handle_dictionary_item_added, - "dictionary_item_removed": self._handle_dictionary_item_removed, + changes = [] + + handlers = { + "values_changed": self._collect_values_changed, + "iterable_item_added": self._collect_iterable_item_added, + "iterable_item_removed": self._collect_iterable_item_removed, + "dictionary_item_added": self._collect_dictionary_item_added, + "dictionary_item_removed": self._collect_dictionary_item_removed, } for change_type, handler in handlers.items(): if change_type in self.diff: - handler(self.diff[change_type]) + changes.extend(handler(self.diff[change_type], change_type)) + + return changes - def _print_header(self, id_dict: dict[str, Any], keys: list[int | str]) -> None: - id = self._get_id(id_dict, keys) - print(f"'{id}'@{self._get_key_display(keys)}") + def print(self) -> None: + """ + Print the differences between the two schemas as JSON. + """ + print(json.dumps(self.to_change_list(), indent=2)) + + def _get_id(self, keys: list[str | int]) -> str: + """ + Extract the most relevant ID from the path using `_find_id` and return + it. If no ID is found, return "unknown". - def _handle_values_changed(self, changes: dict[str, Any]) -> None: + Parameters + ---------- + keys : list[str | int] + The path to extract the ID from. + + Returns + ------- + str + The extracted ID. + """ + try: + return self._find_id(self.dict_old, keys) + except ValueError: + return "unknown" + + def _collect_values_changed(self, changes: dict[str, Any], change_type: str) -> list[dict[str, Any]]: + """Collect value change differences.""" + results = [] for key in changes: keys = self._parse_deepdiff_path(key) - value1 = changes[key]["old_value"] - value2 = changes[key]["new_value"] - self._print_header(self.dict1, keys) - print(f"- {value1}") - print(f"+ {value2}") + id_value = self._get_id(keys) + + results.append( + { + "change_type": change_type, + "id": id_value, + "path": self._get_key_display(keys), + "old_value": changes[key]["old_value"], + "new_value": changes[key]["new_value"], + } + ) + return results - def _handle_iterable_item_added(self, changes: dict[str, Any]) -> None: + def _collect_iterable_item_added(self, changes: dict[str, Any], change_type: str) -> list[dict[str, Any]]: + """Collect iterable item addition differences.""" + results = [] for key in changes: keys = self._parse_deepdiff_path(key) - value = changes[key] - self._print_header(self.dict2, keys) - print(f"+ {value}") + id_value = self._get_id(keys) + + results.append( + { + "change_type": change_type, + "id": id_value, + "path": self._get_key_display(keys), + "value": changes[key], + } + ) + return results - def _handle_iterable_item_removed(self, changes: dict[str, Any]) -> None: + def _collect_iterable_item_removed( + self, changes: dict[str, Any], change_type: str + ) -> list[dict[str, Any]]: + """Collect iterable item removal differences.""" + results = [] for key in changes: keys = self._parse_deepdiff_path(key) - value = changes[key] - self._print_header(self.dict1, keys) - print(f"- {value}") + id_value = self._get_id(keys) + + results.append( + { + "change_type": change_type, + "id": id_value, + "path": self._get_key_display(keys), + "value": changes[key], + } + ) + return results - def _handle_dictionary_item_added(self, changes: dict[str, Any]) -> None: + def _collect_dictionary_item_added( + self, changes: dict[str, Any], change_type: str + ) -> list[dict[str, Any]]: + """Collect dictionary item addition differences.""" + results = [] for key in changes: keys = self._parse_deepdiff_path(key) - value = keys[-1] - keys.pop() - self._print_header(self.dict2, keys) - print(f"+ {value}") - def _handle_dictionary_item_removed(self, changes: dict[str, Any]) -> None: + added_key = keys[-1] + parent_keys = keys[:-1] + id_value = self._get_id(keys) + + results.append( + { + "change_type": change_type, + "id": id_value, + "path": self._get_key_display(parent_keys), + "added_key": added_key, + "value": changes[key], + } + ) + return results + + def _collect_dictionary_item_removed( + self, changes: dict[str, Any], change_type: str + ) -> list[dict[str, Any]]: + """Collect dictionary item removal differences.""" + results = [] for key in changes: keys = self._parse_deepdiff_path(key) - value = keys[-1] - keys.pop() - self._print_header(self.dict1, keys) - print(f"- {value}") + removed_key = keys[-1] + parent_keys = keys[:-1] + + id_value = self._get_id(keys) + + results.append( + { + "change_type": change_type, + "id": id_value, + "path": self._get_key_display(parent_keys), + "removed_key": removed_key, + "value": changes[key], + } + ) + return results @staticmethod - def _get_id(values: dict, keys: list[str | int]) -> str: + def _find_id(values: dict, keys: list[str | int]) -> str: + """Extract the most relevant ID from the path, usually the last 'id' + found. + """ value: list | dict = values last_id = None @@ -214,10 +349,12 @@ def _get_id(values: dict, keys: list[str | int]) -> str: @staticmethod def _get_key_display(keys: list[str | int]) -> str: + """Convert keys list to a dot-notation path.""" return ".".join(str(k) for k in keys) @staticmethod def _parse_deepdiff_path(path: str) -> list[str | int]: + """Parse a DeepDiff path into a list of keys.""" if path.startswith("root"): path = path[4:] @@ -236,7 +373,7 @@ def _parse_deepdiff_path(path: str) -> list[str | int]: class DatabaseDiff(SchemaDiff): """ - Compare a schema with a database and print the differences. + Compare a schema with a database and emit structured differences. Parameters ---------- @@ -256,9 +393,31 @@ def __init__(self, schema: Schema, engine: Engine): schema_metadata = MetaDataBuilder(schema, apply_schema_to_metadata=False).build() self._diff = compare_metadata(mc, schema_metadata) + def to_change_list(self) -> list[dict[str, Any]]: + """ + Convert database differences to structured format. + + Returns + ------- + list[dict[str, Any]] + List of database change dictionaries. + """ + changes = [] + for change in self._diff: + changes.append( + { + "change_type": "database_diff", + "operation": str(change[0]) if change else "unknown", + "details": str(change) if change else "no details", + } + ) + return changes + def print(self) -> None: """ - Print the differences between the schema and the database. + Print the differences between the schema and the database as JSON. """ if self.has_changes: - pprint.pprint(self.diff) + print(json.dumps(self.to_change_list(), indent=2)) + else: + print("[]") From d7783d6405060b9a6b41f06ac46cc2a690ef1dca Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Tue, 1 Jul 2025 18:02:37 -0500 Subject: [PATCH 06/21] Add support for printing diff output to a file --- python/felis/cli.py | 10 +++- python/felis/diff.py | 115 +++++++++++++++++++++++-------------------- tests/test_diff.py | 26 +++++----- 3 files changed, 84 insertions(+), 67 deletions(-) diff --git a/python/felis/cli.py b/python/felis/cli.py index 03f9494a..d7d2d679 100644 --- a/python/felis/cli.py +++ b/python/felis/cli.py @@ -432,6 +432,13 @@ def validate( ) @click.option("-E", "--error-on-change", is_flag=True, help="Exit with error code if schemas are different") @click.option("--table", "tables", multiple=True, help="Table names to filter on.") +@click.option( + "--output-file", + "-o", + type=click.File(mode="w"), + help="Write diff output to a file insteading of printing", + default=None, +) @click.argument("files", nargs=-1, type=click.File()) @click.pass_context def diff( @@ -440,6 +447,7 @@ def diff( comparator: str, error_on_change: bool, tables: list[str], + output_file: IO[str] | None, files: Iterable[IO[str]], ) -> None: schemas = [ @@ -466,7 +474,7 @@ def diff( "Invalid arguments - provide two schemas or a schema and a database engine URL" ) - diff.print() + diff.print(output_file) if diff.has_changes and error_on_change: raise click.ClickException("Schema was changed") diff --git a/python/felis/diff.py b/python/felis/diff.py index 766cdd4d..9f991f17 100644 --- a/python/felis/diff.py +++ b/python/felis/diff.py @@ -25,7 +25,7 @@ import json import logging import re -from typing import Any +from typing import IO, Any from alembic.autogenerate import compare_metadata from alembic.migration import MigrationContext @@ -108,11 +108,16 @@ def to_change_list(self) -> list[dict[str, Any]]: """ return [] # Base implementation returns empty list - def print(self) -> None: + def print(self, output_stream: IO[str] | None = None) -> None: """ Print the differences between the two schemas in raw format. + + Parameters + ---------- + output_stream + The output stream for printing the differences. """ - print(self.diff) + print(self.diff, file=output_stream) @property def has_changes(self) -> bool: @@ -179,23 +184,28 @@ def to_change_list(self) -> list[dict[str, Any]]: "values_changed": self._collect_values_changed, "iterable_item_added": self._collect_iterable_item_added, "iterable_item_removed": self._collect_iterable_item_removed, - "dictionary_item_added": self._collect_dictionary_item_added, + "dictionary_item_added": lambda paths: self._collect_dictionary_item_added(paths), "dictionary_item_removed": self._collect_dictionary_item_removed, } for change_type, handler in handlers.items(): if change_type in self.diff: - changes.extend(handler(self.diff[change_type], change_type)) + changes.extend(handler(self.diff[change_type])) return changes - def print(self) -> None: + def print(self, output_stream: IO[str] | None = None) -> None: """ Print the differences between the two schemas as JSON. + + Parameters + ---------- + output_stream + The output stream for printing the differences. """ - print(json.dumps(self.to_change_list(), indent=2)) + print(json.dumps(self.to_change_list(), indent=2), file=output_stream) - def _get_id(self, keys: list[str | int]) -> str: + def _get_id(self, source_dict: dict, keys: list[str | int]) -> str: """ Extract the most relevant ID from the path using `_find_id` and return it. If no ID is found, return "unknown". @@ -211,21 +221,19 @@ def _get_id(self, keys: list[str | int]) -> str: The extracted ID. """ try: - return self._find_id(self.dict_old, keys) + return self._find_id(source_dict, keys) except ValueError: return "unknown" - def _collect_values_changed(self, changes: dict[str, Any], change_type: str) -> list[dict[str, Any]]: + def _collect_values_changed(self, changes: dict[str, Any]) -> list[dict[str, Any]]: """Collect value change differences.""" results = [] for key in changes: keys = self._parse_deepdiff_path(key) - id_value = self._get_id(keys) - results.append( { - "change_type": change_type, - "id": id_value, + "change_type": "values_changed", + "id": self._get_id(self.dict_old, keys), "path": self._get_key_display(keys), "old_value": changes[key]["old_value"], "new_value": changes[key]["new_value"], @@ -233,81 +241,78 @@ def _collect_values_changed(self, changes: dict[str, Any], change_type: str) -> ) return results - def _collect_iterable_item_added(self, changes: dict[str, Any], change_type: str) -> list[dict[str, Any]]: + def _collect_iterable_item_added(self, changes: dict[str, Any]) -> list[dict[str, Any]]: """Collect iterable item addition differences.""" results = [] for key in changes: keys = self._parse_deepdiff_path(key) - id_value = self._get_id(keys) - results.append( { - "change_type": change_type, - "id": id_value, + "change_type": "iterable_item_added", + "id": self._get_id(self.dict_new, keys), "path": self._get_key_display(keys), "value": changes[key], } ) return results - def _collect_iterable_item_removed( - self, changes: dict[str, Any], change_type: str - ) -> list[dict[str, Any]]: + def _collect_iterable_item_removed(self, changes: dict[str, Any]) -> list[dict[str, Any]]: """Collect iterable item removal differences.""" results = [] for key in changes: keys = self._parse_deepdiff_path(key) - id_value = self._get_id(keys) - results.append( { - "change_type": change_type, - "id": id_value, + "change_type": "iterable_item_removed", + "id": self._get_id(self.dict_old, keys), "path": self._get_key_display(keys), "value": changes[key], } ) return results - def _collect_dictionary_item_added( - self, changes: dict[str, Any], change_type: str - ) -> list[dict[str, Any]]: - """Collect dictionary item addition differences.""" - results = [] - for key in changes: - keys = self._parse_deepdiff_path(key) + @classmethod + def _get_value_from_key(cls, data: Any, keys: list[str | int]) -> Any: + for key in keys: + data = data[key] # step through nested dicts/lists + return data + def _collect_dictionary_item_added(self, paths: list[str]) -> list[dict[str, Any]]: + """Collect dictionary item addition differences from DeepDiff path + list. + """ + results = [] + for path in paths: + keys = self._parse_deepdiff_path(path) added_key = keys[-1] parent_keys = keys[:-1] - id_value = self._get_id(keys) - + try: + value = self._get_value_from_key(self.dict_new, keys) + except (KeyError, IndexError, TypeError): + logger.warning(f"Could not resolve value for path: {path}") + value = None results.append( { - "change_type": change_type, - "id": id_value, + "change_type": "dictionary_item_added", + "id": self._get_id(self.dict_new, keys), "path": self._get_key_display(parent_keys), "added_key": added_key, - "value": changes[key], + "value": value, } ) return results - def _collect_dictionary_item_removed( - self, changes: dict[str, Any], change_type: str - ) -> list[dict[str, Any]]: + def _collect_dictionary_item_removed(self, changes: dict[str, Any]) -> list[dict[str, Any]]: """Collect dictionary item removal differences.""" results = [] for key in changes: keys = self._parse_deepdiff_path(key) removed_key = keys[-1] parent_keys = keys[:-1] - - id_value = self._get_id(keys) - results.append( { - "change_type": change_type, - "id": id_value, + "change_type": "dictionary_item_removed", + "id": self._get_id(self.dict_old, keys), "path": self._get_key_display(parent_keys), "removed_key": removed_key, "value": changes[key], @@ -349,8 +354,10 @@ def _find_id(values: dict, keys: list[str | int]) -> str: @staticmethod def _get_key_display(keys: list[str | int]) -> str: - """Convert keys list to a dot-notation path.""" - return ".".join(str(k) for k in keys) + """Convert keys list to a dot-notation path. If no keys are provided, + we assume the root path. + """ + return ".".join(str(k) for k in keys) if keys else "root" @staticmethod def _parse_deepdiff_path(path: str) -> list[str | int]: @@ -413,11 +420,13 @@ def to_change_list(self) -> list[dict[str, Any]]: ) return changes - def print(self) -> None: + def print(self, output_stream: IO[str] | None = None) -> None: """ Print the differences between the schema and the database as JSON. + + Parameters + ---------- + output_stream + The output stream for printing the differences. """ - if self.has_changes: - print(json.dumps(self.to_change_list(), indent=2)) - else: - print("[]") + print(json.dumps(self.to_change_list(), indent=2), file=output_stream) diff --git a/tests/test_diff.py b/tests/test_diff.py index fb09b31d..4c1e7e8f 100644 --- a/tests/test_diff.py +++ b/tests/test_diff.py @@ -37,7 +37,10 @@ def _diff(self, schema1: Schema, schema2: Schema) -> dict[str, Any]: return SchemaDiff(schema1, schema2).diff def test_schema_diff(self) -> None: - """Test the comparison output generated by the SchemaDiff class.""" + """Test the comparison output generated by the SchemaDiff class. + Since the handler functions are called by dispatch in the SchemaDiff + class, we activate them manually here to ensure test coverage. + """ # Two schemas with different values schema1 = dm.Schema(name="schema1", id="#schema1", version="1.2.3", description="Schema 1", tables=[]) schema2 = dm.Schema(name="schema2", id="#schema2", version="4.5.6", description="Schema 2", tables=[]) @@ -48,7 +51,7 @@ def test_schema_diff(self) -> None: ) # Call formatted handler function - FormattedSchemaDiff(schema1, schema2)._handle_values_changed(diff["values_changed"]) + FormattedSchemaDiff(schema1, schema2)._collect_values_changed(diff["values_changed"]) # Table added schema2.tables.append(dm.Table(name="table1", id="#table1", description="Table 1", columns=[])) @@ -57,7 +60,7 @@ def test_schema_diff(self) -> None: self.assertIn("root['tables'][0]", diff["iterable_item_added"]) # Call formatted handler function - FormattedSchemaDiff(schema1, schema2)._handle_iterable_item_added(diff["iterable_item_added"]) + FormattedSchemaDiff(schema1, schema2)._collect_iterable_item_added(diff["iterable_item_added"]) # Table removed schema2.tables.clear() @@ -67,7 +70,7 @@ def test_schema_diff(self) -> None: self.assertIn("root['tables'][0]", diff["iterable_item_removed"]) # Call formatted handler function - FormattedSchemaDiff(schema1, schema2)._handle_iterable_item_removed(diff["iterable_item_removed"]) + FormattedSchemaDiff(schema1, schema2)._collect_iterable_item_removed(diff["iterable_item_removed"]) # Different table descriptions schema1 = dm.Schema(name="schema1", id="#schema1", version="1.2.3", description="Schema 1", tables=[]) @@ -167,7 +170,8 @@ def test_schema_diff(self) -> None: self.assertIn("root['tables'][0]['columns'][0]['ivoa_ucd']", diff["dictionary_item_added"]) # Call formatted handler function - FormattedSchemaDiff(schema1, schema2)._handle_dictionary_item_added(diff["dictionary_item_added"]) + print("Diff:", diff) + FormattedSchemaDiff(schema1, schema2)._collect_dictionary_item_added(diff["dictionary_item_added"]) # Removed a field from a column schema1 = dm.Schema(name="schema1", id="#schema1", version="1.2.3", description="Schema 1", tables=[]) @@ -192,7 +196,9 @@ def test_schema_diff(self) -> None: self.assertIn("root['tables'][0]['columns'][0]['ivoa_ucd']", diff["dictionary_item_removed"]) # Call formatted handler function - FormattedSchemaDiff(schema1, schema2)._handle_dictionary_item_removed(diff["dictionary_item_removed"]) + FormattedSchemaDiff(schema1, schema2)._collect_dictionary_item_removed( + diff["dictionary_item_removed"] + ) def test_index_diff(self) -> None: """Test differences in indices between tables.""" @@ -221,7 +227,7 @@ def test_index_diff(self) -> None: self.assertEqual(old_value, "column1") self.assertEqual(new_value, "column2") - # Print formatted diff to make sure it works for these changes + # Print the diff to ensure it works without errors FormattedSchemaDiff(schema1, schema2).print() def test_print(self) -> None: @@ -239,12 +245,6 @@ def test_parse_deepdiff_path(self) -> None: keys = FormattedSchemaDiff._parse_deepdiff_path(path) self.assertListEqual(keys, ["tables", 0, "columns", 0, "ivoa_ucd"]) - def test_get_id_error(self) -> None: - id_dict = {"tables": [{"indexes": [{"columns": [{"name": "column1"}, {"name": "column2"}]}]}]} - keys = ["tables", 0, "indexes", 0, "columns", 0] - with self.assertRaises(ValueError): - FormattedSchemaDiff._get_id(id_dict, keys) - def test_table_filter(self) -> None: schema1 = dm.Schema( name="schema1", From a9706e197da58033d8dde34350072c36d539fc5d Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Wed, 2 Jul 2025 13:25:08 -0500 Subject: [PATCH 07/21] Add click to intersphinx links --- docs/documenteer.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/documenteer.toml b/docs/documenteer.toml index ccccf045..1970f07f 100644 --- a/docs/documenteer.toml +++ b/docs/documenteer.toml @@ -34,3 +34,4 @@ python_api_dir = "dev/internals" python = "https://docs.python.org/3" sqlalchemy = "https://docs.sqlalchemy.org/en/latest" lsst = "https://pipelines.lsst.io/v/weekly" +click = "https://click.palletsprojects.com" From 7360a993a04792060ff75eb3c89433b8ad70d524 Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Wed, 2 Jul 2025 13:25:29 -0500 Subject: [PATCH 08/21] Return the command result from run_cli() function --- python/felis/tests/run_cli.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/python/felis/tests/run_cli.py b/python/felis/tests/run_cli.py index 04102fdf..4c022d0b 100644 --- a/python/felis/tests/run_cli.py +++ b/python/felis/tests/run_cli.py @@ -23,6 +23,7 @@ import logging +import click from click.testing import CliRunner from felis.cli import cli @@ -38,7 +39,7 @@ def run_cli( print_cmd: bool = False, print_output: bool = False, id_generation: bool = False, -) -> None: +) -> click.testing.Result: """Run a CLI command and check the exit code. Parameters @@ -57,6 +58,11 @@ def run_cli( Whether to print the output, by default False. id_generation : bool Whether to enable id generation, by default False. + + Returns + ------- + click.testing.Result + The result of the command execution. """ if not cmd: raise ValueError("No command provided.") @@ -77,3 +83,4 @@ def run_cli( assert result.exit_code != 0 else: assert result.exit_code == 0 + return result From 57727b1c9065bb736c62f4204495d40e1514b767 Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Wed, 2 Jul 2025 13:26:14 -0500 Subject: [PATCH 09/21] Fix cli tests after updating diff module --- tests/test_cli.py | 36 +++++++++++++----------------------- 1 file changed, 13 insertions(+), 23 deletions(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index 82a7360b..55c7b615 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -154,19 +154,18 @@ def test_diff_database_with_table_filter(self) -> None: """Test for ``diff`` command with database and table filter. This should fail as table filters are not supported for this comparator. """ - test_diff1 = os.path.join(TESTDIR, "data", "test_diff1.yaml") - test_diff2 = os.path.join(TESTDIR, "data", "test_diff2.yaml") - db_url = f"sqlite:///{self.tmpdir}/tap_schema.sqlite3" + test_diff1 = os.path.join(TEST_DIR, "data", "test_diff1.yaml") + test_diff2 = os.path.join(TEST_DIR, "data", "test_diff2.yaml") + db_url = f"sqlite:///{self.tmpdir}/tap_schema.sqlite3" engine = create_engine(db_url) metadata_db = MetaDataBuilder(Schema.from_uri(test_diff1), apply_schema_to_metadata=False).build() metadata_db.create_all(engine) - runner = CliRunner() - result = runner.invoke( - cli, ["diff", f"--engine-url={db_url}", "--table", "table1", test_diff2], catch_exceptions=False + run_cli( + ["diff", f"--engine-url={db_url}", "--table", "table1", test_diff2], + expect_error=True, ) - self.assertNotEqual(result.exit_code, 0) def test_diff_alembic(self) -> None: """Test for ``diff`` command with ``--alembic`` comparator option.""" @@ -179,17 +178,13 @@ def test_diff_alembic_with_table_filter(self) -> None: filter. This should fail as table filters are not supported for this comparator. """ - test_diff1 = os.path.join(TESTDIR, "data", "test_diff1.yaml") - test_diff2 = os.path.join(TESTDIR, "data", "test_diff2.yaml") + test_diff1 = os.path.join(TEST_DIR, "data", "test_diff1.yaml") + test_diff2 = os.path.join(TEST_DIR, "data", "test_diff2.yaml") - runner = CliRunner() - result = runner.invoke( - cli, + run_cli( ["diff", "--comparator", "alembic", "--table", "table1", test_diff1, test_diff2], - catch_exceptions=False, + expect_error=True, ) - print(result.output) - self.assertNotEqual(result.exit_code, 0) def test_diff_error(self) -> None: """Test for ``diff`` command with bad arguments.""" @@ -204,15 +199,10 @@ def test_diff_error_on_change(self) -> None: def test_diff_with_table_filter(self) -> None: """Test for ``diff`` command and a table filter.""" - test_diff1 = os.path.join(TESTDIR, "data", "test_diff1.yaml") - test_diff2 = os.path.join(TESTDIR, "data", "test_diff2.yaml") + test_diff1 = os.path.join(TEST_DIR, "data", "test_diff1.yaml") + test_diff2 = os.path.join(TEST_DIR, "data", "test_diff2.yaml") - runner = CliRunner() - result = runner.invoke( - cli, ["diff", "--table", "table1", test_diff1, test_diff2], catch_exceptions=False - ) - print(result.output) - self.assertEqual(result.exit_code, 0) + run_cli(["diff", "--table", "table1", test_diff1, test_diff2], print_output=True) def test_dump_yaml(self) -> None: """Test for ``dump`` command with YAML output.""" From c667b6f0139bbb7ca574c3e30df69d7631557e59 Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Wed, 2 Jul 2025 17:15:14 -0500 Subject: [PATCH 10/21] Improve the error handling when generating a diff against a database --- python/felis/diff.py | 71 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 64 insertions(+), 7 deletions(-) diff --git a/python/felis/diff.py b/python/felis/diff.py index 9f991f17..66f9b053 100644 --- a/python/felis/diff.py +++ b/python/felis/diff.py @@ -27,6 +27,7 @@ import re from typing import IO, Any +import sqlalchemy from alembic.autogenerate import compare_metadata from alembic.migration import MigrationContext from deepdiff.diff import DeepDiff @@ -388,17 +389,73 @@ class DatabaseDiff(SchemaDiff): The schema to compare. engine The database engine to compare with. + + Notes + ----- + The `DatabaseDiff` class uses SQLAlchemy to reflect the database schema + and compare it with the provided `~felis.datamodel.Schema` object. It + generates a list of differences between the two schemas, which can be + printed or converted to a structured format. + + The error-handling during the reflection and comparison process is + robust, catching various exceptions that may arise from database + connectivity issues, invalid configurations, or unexpected errors. + This is done because otherwise some obscure errors may be raised + during the reflection process and configuration of alembic, which are not + very informative to the user. """ def __init__(self, schema: Schema, engine: Engine): + self.schema = schema + self.engine = engine + self._generate_diff() + + def _generate_diff(self) -> None: + """Generate the differences between the provided schema and + database. + """ db_metadata = MetaData() - with engine.connect() as connection: - db_metadata.reflect(bind=connection) - mc = MigrationContext.configure( - connection, opts={"compare_type": True, "target_metadata": db_metadata} - ) - schema_metadata = MetaDataBuilder(schema, apply_schema_to_metadata=False).build() - self._diff = compare_metadata(mc, schema_metadata) + with self.engine.connect() as connection: + # Reflect the database schema + try: + db_metadata.reflect(bind=connection) + except (sqlalchemy.exc.DatabaseError, sqlalchemy.exc.OperationalError) as e: + raise RuntimeError(f"Database reflection failed: {e}") from e + except AttributeError as e: # Happens when no database is provided in the URL + raise ValueError( + f"Invalid engine URL: <{self.engine.url}> (Missing database or schema?)" + ) from e + except sqlalchemy.exc.ArgumentError as e: + raise ValueError(f"Invalid database URL or configuration: {e}") from e + except Exception as e: + raise RuntimeError(f"Unexpected error during database reflection: {e}") from e + + # Configure the alembic migration context using the reflected + # metadata + try: + mc = MigrationContext.configure( + connection, opts={"compare_type": True, "target_metadata": db_metadata} + ) + except (sqlalchemy.exc.DatabaseError, TypeError, ValueError) as e: + raise RuntimeError(f"Migration context configuration failed: {e}") from e + except Exception as e: + raise RuntimeError(f"Unexpected error in migration context configuration: {e}") from e + + # Build the schema metadata for comparison + try: + schema_metadata = MetaDataBuilder(self.schema, apply_schema_to_metadata=False).build() + except (ValueError, TypeError) as e: + raise ValueError(f"Schema metadata construction failed: {e}") from e + except Exception as e: + raise RuntimeError(f"Unexpected error in schema metadata construction: {e}") from e + + # Compare the database metadata with the schema metadata + try: + self._diff = compare_metadata(mc, schema_metadata) + except (sqlalchemy.exc.DatabaseError, AttributeError, TypeError) as e: + raise RuntimeError(f"Metadata comparison failed: {e}") from e + except Exception as e: + raise RuntimeError(f"Unexpected error during metadata comparison: {e}") from e def to_change_list(self) -> list[dict[str, Any]]: """ From efd349fdbfd97c8efaffe90e7d563e48351a8f0a Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Wed, 2 Jul 2025 18:08:02 -0500 Subject: [PATCH 11/21] Add import of data model classes --- python/felis/__init__.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/python/felis/__init__.py b/python/felis/__init__.py index bc196a61..5c59224f 100644 --- a/python/felis/__init__.py +++ b/python/felis/__init__.py @@ -19,7 +19,19 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . -from .datamodel import Schema +from .datamodel import ( + CheckConstraint, + Column, + ColumnGroup, + Constraint, + DataType, + ForeignKeyConstraint, + Index, + Schema, + SchemaVersion, + Table, + UniqueConstraint, +) from .db.schema import create_database from .diff import DatabaseDiff, FormattedSchemaDiff, SchemaDiff from .metadata import MetaDataBuilder From c56ba05c036efd7375ddd1cd29d53b34ea3bb8e7 Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Thu, 3 Jul 2025 12:09:55 -0500 Subject: [PATCH 12/21] Remove SchemaDiff from felis imports --- python/felis/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/felis/__init__.py b/python/felis/__init__.py index 5c59224f..2849b5ff 100644 --- a/python/felis/__init__.py +++ b/python/felis/__init__.py @@ -33,7 +33,7 @@ UniqueConstraint, ) from .db.schema import create_database -from .diff import DatabaseDiff, FormattedSchemaDiff, SchemaDiff +from .diff import DatabaseDiff, FormattedSchemaDiff from .metadata import MetaDataBuilder from importlib.metadata import PackageNotFoundError, version From e117b0257baac955d760f77c3a6296aede2d3693 Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Wed, 9 Jul 2025 13:56:25 -0500 Subject: [PATCH 13/21] Add an additional attribute to diff test schema --- tests/data/test_diff2.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/data/test_diff2.yaml b/tests/data/test_diff2.yaml index 394839bd..a1e51020 100644 --- a/tests/data/test_diff2.yaml +++ b/tests/data/test_diff2.yaml @@ -3,6 +3,7 @@ name: test_diff description: Another test diff "@id": "#test_diff" version: "4.5.6" +votable:utype: Schema tables: - name: test_table1 "@id": "#test_table1" @@ -12,6 +13,7 @@ tables: "@id": "#test_table1.column1" datatype: int description: Column 1 + ivoa:ucd: meta.id - name: column2 "@id": "#test_table1.column2" datatype: string From 31b909851972c36dbbef46d8773ee3c37fcc7447 Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Wed, 9 Jul 2025 14:03:10 -0500 Subject: [PATCH 14/21] Update the display of the path --- python/felis/diff.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/python/felis/diff.py b/python/felis/diff.py index 66f9b053..a47e76c5 100644 --- a/python/felis/diff.py +++ b/python/felis/diff.py @@ -235,7 +235,7 @@ def _collect_values_changed(self, changes: dict[str, Any]) -> list[dict[str, Any { "change_type": "values_changed", "id": self._get_id(self.dict_old, keys), - "path": self._get_key_display(keys), + "path": self._get_display_path(keys), "old_value": changes[key]["old_value"], "new_value": changes[key]["new_value"], } @@ -251,7 +251,7 @@ def _collect_iterable_item_added(self, changes: dict[str, Any]) -> list[dict[str { "change_type": "iterable_item_added", "id": self._get_id(self.dict_new, keys), - "path": self._get_key_display(keys), + "path": self._get_display_path(keys), "value": changes[key], } ) @@ -266,7 +266,7 @@ def _collect_iterable_item_removed(self, changes: dict[str, Any]) -> list[dict[s { "change_type": "iterable_item_removed", "id": self._get_id(self.dict_old, keys), - "path": self._get_key_display(keys), + "path": self._get_display_path(keys), "value": changes[key], } ) @@ -296,7 +296,7 @@ def _collect_dictionary_item_added(self, paths: list[str]) -> list[dict[str, Any { "change_type": "dictionary_item_added", "id": self._get_id(self.dict_new, keys), - "path": self._get_key_display(parent_keys), + "path": self._get_display_path(parent_keys), "added_key": added_key, "value": value, } @@ -314,7 +314,7 @@ def _collect_dictionary_item_removed(self, changes: dict[str, Any]) -> list[dict { "change_type": "dictionary_item_removed", "id": self._get_id(self.dict_old, keys), - "path": self._get_key_display(parent_keys), + "path": self._get_display_path(parent_keys), "removed_key": removed_key, "value": changes[key], } @@ -354,11 +354,19 @@ def _find_id(values: dict, keys: list[str | int]) -> str: raise ValueError("No 'id' found in the specified path") @staticmethod - def _get_key_display(keys: list[str | int]) -> str: - """Convert keys list to a dot-notation path. If no keys are provided, - we assume the root path. + def _get_display_path(keys: list[str | int]) -> str: + """Convert keys list to a dot-notation path. + + - If the key is at the root level (e.g., ['description']), return + 'root.description'. + - If keys are empty, return 'root'. + - Otherwise, join keys with dot notation. """ - return ".".join(str(k) for k in keys) if keys else "root" + if not keys: + return "root" + if len(keys) == 1: + return f"root.{keys[0]}" + return ".".join(str(k) for k in keys) @staticmethod def _parse_deepdiff_path(path: str) -> list[str | int]: From f2196fb700dee2171d1dcd1d10d75e93e1aa119a Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Wed, 9 Jul 2025 16:10:58 -0500 Subject: [PATCH 15/21] Update diff test files --- tests/data/test_diff1.yaml | 17 ++++++++++------- tests/data/test_diff2.yaml | 10 ++++++++++ 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/tests/data/test_diff1.yaml b/tests/data/test_diff1.yaml index fdccef73..e2f5fcdc 100644 --- a/tests/data/test_diff1.yaml +++ b/tests/data/test_diff1.yaml @@ -1,37 +1,40 @@ --- name: test_diff description: Test diff -"@id": "#test_diff" version: "1.2.3" tables: - name: test_table1 - "@id": "#test_table1" description: Test table 1 columns: - name: column1 - "@id": "#test_table1.column1" datatype: int description: Column 1 - name: column2 - "@id": "#test_table1.column2" datatype: string description: Column 2 length: 30 nullable: false - name: column3 - "@id": "#test_table1.column3" datatype: string description: Column 3 length: 100 indexes: - name: idx_column1 - "@id": "#test_table1_idx_column1" description: Index on column 1 columns: - "#test_table1.column1" + columnGroups: + - name: group1 + description: Group 1 + columns: + - "#test_table1.column1" + - name: group2 + description: Group 2 + columns: + - "#test_table1.column1" + - "#test_table1.column2" constraints: - name: uniq_column2 - "@id": "#test_table1_uniq_column2" "@type": "Unique" description: Unique column 2 columns: diff --git a/tests/data/test_diff2.yaml b/tests/data/test_diff2.yaml index a1e51020..a2406a25 100644 --- a/tests/data/test_diff2.yaml +++ b/tests/data/test_diff2.yaml @@ -26,6 +26,16 @@ tables: description: Index on column 2 columns: - "#test_table1.column2" + columnGroups: + - name: group1 + description: Group 1 + columns: + - "#test_table1.column1" + - "#test_table1.column2" + - name: group2 + description: Group 2 + columns: + - "#test_table1.column1" constraints: - name: uniq_column2 "@id": "#test_table1_uniq_column2" From 0758e5fcc6d5f2cb587a3e66983b0e57de801324 Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Wed, 9 Jul 2025 16:09:48 -0500 Subject: [PATCH 16/21] Refactor diff module to use classes for handling diff changes Replace the call map with a class for handling each type of change. --- python/felis/diff.py | 365 +++++++++++++++++++------------------------ 1 file changed, 165 insertions(+), 200 deletions(-) diff --git a/python/felis/diff.py b/python/felis/diff.py index a47e76c5..427e84a5 100644 --- a/python/felis/diff.py +++ b/python/felis/diff.py @@ -21,16 +21,15 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . -import copy import json import logging -import re from typing import IO, Any import sqlalchemy from alembic.autogenerate import compare_metadata from alembic.migration import MigrationContext from deepdiff.diff import DeepDiff +from deepdiff.model import DiffLevel from sqlalchemy import Engine, MetaData from .datamodel import Schema @@ -44,6 +43,46 @@ logging.getLogger("alembic").setLevel(logging.WARNING) +def _normalize_lists_by_name(obj: Any) -> Any: + """ + Recursively normalize structures: + - Lists of dicts under specified keys become dicts keyed by 'name'. + - Lists of strings under specified keys become sorted lists. + - Everything else is recursively normalized in place. + + Parameters + ---------- + obj + The object to normalize, which can be a list, dict, or any other type. + """ + dict_like_keys = {"tables", "columns", "constraints", "indexes", "column_groups"} + set_like_keys = {"columns", "referencedColumns"} + + if isinstance(obj, list): + return [_normalize_lists_by_name(item) for item in obj] + + elif isinstance(obj, dict): + normalized: dict[str, Any] = {} + + for k, v in obj.items(): + if isinstance(v, list): + if k in dict_like_keys and all(isinstance(i, dict) and "name" in i for i in v): + logger.debug(f"Normalizing list of dicts under key '{k}' to dict keyed by 'name'") + normalized[k] = {i["name"]: _normalize_lists_by_name(i) for i in v} + elif k in set_like_keys and all(isinstance(i, str) for i in v): + logger.debug(f"Normalizing list of strings under key '{k}' to sorted list: {v}") + normalized[k] = sorted(v) + else: + normalized[k] = [_normalize_lists_by_name(i) for i in v] + else: + normalized[k] = _normalize_lists_by_name(v) + + return normalized + + else: + return obj + + class SchemaDiff: """ Compare two schemas using DeepDiff and print the differences. @@ -56,6 +95,8 @@ class SchemaDiff: The new schema to compare, typically the modified schema. table_filter A list of table names to filter on. + strip_ids + Whether to strip '@id' fields from the schemas before comparison. Notes ----- @@ -65,12 +106,19 @@ class SchemaDiff: directly. """ - def __init__(self, schema_old: Schema, schema_new: Schema, table_filter: list[str] = []): - self.schema_old = copy.deepcopy(schema_old) - self.schema_new = copy.deepcopy(schema_new) + def __init__( + self, + schema_old: Schema, + schema_new: Schema, + table_filter: list[str] | None = None, + strip_ids: bool = True, + ): + self.schema_old = schema_old + self.schema_new = schema_new if table_filter: logger.debug(f"Filtering on tables: {table_filter}") - self.table_filter = table_filter + self.table_filter = table_filter or [] + self.strip_ids = strip_ids self._create_diff() def _create_diff(self) -> dict[str, Any]: @@ -81,9 +129,16 @@ def _create_diff(self) -> dict[str, Any]: self.schema_new.tables = [ table for table in self.schema_new.tables if table.name in self.table_filter ] - self.dict_old = self.schema_old.model_dump(exclude_none=True, exclude_defaults=True) - self.dict_new = self.schema_new.model_dump(exclude_none=True, exclude_defaults=True) - self._diff = DeepDiff(self.dict_old, self.dict_new, ignore_order=True) + self.dict_old = _normalize_lists_by_name(self.schema_old._model_dump(strip_ids=self.strip_ids)) + self.dict_new = _normalize_lists_by_name(self.schema_new._model_dump(strip_ids=self.strip_ids)) + logger.debug(f"Normalized old dict:\n{json.dumps(self.dict_old, indent=2)}") + logger.debug(f"Normalized new dict:\n{json.dumps(self.dict_new, indent=2)}") + self._diff = DeepDiff( + self.dict_old, + self.dict_new, + ignore_order=True, + view="tree", + ) return self._diff @property @@ -107,7 +162,7 @@ def to_change_list(self) -> list[dict[str, Any]]: list[dict[str, Any]] List of change dictionaries. """ - return [] # Base implementation returns empty list + raise NotImplementedError("Subclasses must implement to_change_list()") def print(self, output_stream: IO[str] | None = None) -> None: """ @@ -133,6 +188,95 @@ def has_changes(self) -> bool: return len(self.diff) > 0 +class DiffHandler: + def collect(self, diff_items: list[DiffLevel]) -> list[dict[str, Any]]: + """Collect differences from the provided diff items. + + Parameters + ---------- + diff_items + The list of differences to collect. + """ + raise NotImplementedError + + +class ValuesChangedHandler(DiffHandler): + def collect(self, diff_items: list[DiffLevel]) -> list[dict[str, Any]]: + results = [] + for diff in diff_items: + results.append( + { + "change_type": diff.report_type, + "path": diff.path(), + "old_value": diff.t1, + "new_value": diff.t2, + } + ) + return results + + +class IterableItemAddedHandler(DiffHandler): + def collect(self, diff_items: list[DiffLevel]) -> list[dict[str, Any]]: + results = [] + for diff in diff_items: + results.append( + { + "change_type": diff.report_type, + "path": diff.path(), + "value": diff.t2, + } + ) + return results + + +class IterableItemRemovedHandler(DiffHandler): + def collect(self, diff_items: list[DiffLevel]) -> list[dict[str, Any]]: + results = [] + for diff in diff_items: + results.append( + { + "change_type": diff.report_type, + "path": diff.path(), + "value": diff.t1, + } + ) + return results + + +class DictionaryItemAddedHandler(DiffHandler): + def collect(self, diff_items: list[DiffLevel]) -> list[dict[str, Any]]: + results = [] + for diff in diff_items: + keys = diff.path(output_format="list") + added_key = keys[-1] if keys else None + results.append( + { + "change_type": diff.report_type, + "path": diff.path(), + "added_key": added_key, + "value": diff.t2, + } + ) + return results + + +class DictionaryItemRemovedHandler(DiffHandler): + def collect(self, diff_items: list[DiffLevel]) -> list[dict[str, Any]]: + results = [] + for diff in diff_items: + keys = diff.path(output_format="list") + removed_key = keys[-1] if keys else None + results.append( + { + "change_type": diff.report_type, + "path": diff.path(), + "removed_key": removed_key, + "value": diff.t1, + } + ) + return results + + class FormattedSchemaDiff(SchemaDiff): """ Compare two schemas using DeepDiff and emit structured JSON differences. @@ -170,6 +314,15 @@ class FormattedSchemaDiff(SchemaDiff): def __init__(self, schema_old: Schema, schema_new: Schema, table_filter: list[str] = []): super().__init__(schema_old, schema_new, table_filter) + # Define a mapping between types of changes and their handlers + self.handlers = { + "values_changed": ValuesChangedHandler(), + "iterable_item_added": IterableItemAddedHandler(), + "iterable_item_removed": IterableItemRemovedHandler(), + "dictionary_item_added": DictionaryItemAddedHandler(), + "dictionary_item_removed": DictionaryItemRemovedHandler(), + } + def to_change_list(self) -> list[dict[str, Any]]: """ Convert differences to a structured format. @@ -181,17 +334,9 @@ def to_change_list(self) -> list[dict[str, Any]]: """ changes = [] - handlers = { - "values_changed": self._collect_values_changed, - "iterable_item_added": self._collect_iterable_item_added, - "iterable_item_removed": self._collect_iterable_item_removed, - "dictionary_item_added": lambda paths: self._collect_dictionary_item_added(paths), - "dictionary_item_removed": self._collect_dictionary_item_removed, - } - - for change_type, handler in handlers.items(): + for change_type, handler in self.handlers.items(): if change_type in self.diff: - changes.extend(handler(self.diff[change_type])) + changes.extend(handler.collect(self.diff[change_type])) return changes @@ -206,186 +351,6 @@ def print(self, output_stream: IO[str] | None = None) -> None: """ print(json.dumps(self.to_change_list(), indent=2), file=output_stream) - def _get_id(self, source_dict: dict, keys: list[str | int]) -> str: - """ - Extract the most relevant ID from the path using `_find_id` and return - it. If no ID is found, return "unknown". - - Parameters - ---------- - keys : list[str | int] - The path to extract the ID from. - - Returns - ------- - str - The extracted ID. - """ - try: - return self._find_id(source_dict, keys) - except ValueError: - return "unknown" - - def _collect_values_changed(self, changes: dict[str, Any]) -> list[dict[str, Any]]: - """Collect value change differences.""" - results = [] - for key in changes: - keys = self._parse_deepdiff_path(key) - results.append( - { - "change_type": "values_changed", - "id": self._get_id(self.dict_old, keys), - "path": self._get_display_path(keys), - "old_value": changes[key]["old_value"], - "new_value": changes[key]["new_value"], - } - ) - return results - - def _collect_iterable_item_added(self, changes: dict[str, Any]) -> list[dict[str, Any]]: - """Collect iterable item addition differences.""" - results = [] - for key in changes: - keys = self._parse_deepdiff_path(key) - results.append( - { - "change_type": "iterable_item_added", - "id": self._get_id(self.dict_new, keys), - "path": self._get_display_path(keys), - "value": changes[key], - } - ) - return results - - def _collect_iterable_item_removed(self, changes: dict[str, Any]) -> list[dict[str, Any]]: - """Collect iterable item removal differences.""" - results = [] - for key in changes: - keys = self._parse_deepdiff_path(key) - results.append( - { - "change_type": "iterable_item_removed", - "id": self._get_id(self.dict_old, keys), - "path": self._get_display_path(keys), - "value": changes[key], - } - ) - return results - - @classmethod - def _get_value_from_key(cls, data: Any, keys: list[str | int]) -> Any: - for key in keys: - data = data[key] # step through nested dicts/lists - return data - - def _collect_dictionary_item_added(self, paths: list[str]) -> list[dict[str, Any]]: - """Collect dictionary item addition differences from DeepDiff path - list. - """ - results = [] - for path in paths: - keys = self._parse_deepdiff_path(path) - added_key = keys[-1] - parent_keys = keys[:-1] - try: - value = self._get_value_from_key(self.dict_new, keys) - except (KeyError, IndexError, TypeError): - logger.warning(f"Could not resolve value for path: {path}") - value = None - results.append( - { - "change_type": "dictionary_item_added", - "id": self._get_id(self.dict_new, keys), - "path": self._get_display_path(parent_keys), - "added_key": added_key, - "value": value, - } - ) - return results - - def _collect_dictionary_item_removed(self, changes: dict[str, Any]) -> list[dict[str, Any]]: - """Collect dictionary item removal differences.""" - results = [] - for key in changes: - keys = self._parse_deepdiff_path(key) - removed_key = keys[-1] - parent_keys = keys[:-1] - results.append( - { - "change_type": "dictionary_item_removed", - "id": self._get_id(self.dict_old, keys), - "path": self._get_display_path(parent_keys), - "removed_key": removed_key, - "value": changes[key], - } - ) - return results - - @staticmethod - def _find_id(values: dict, keys: list[str | int]) -> str: - """Extract the most relevant ID from the path, usually the last 'id' - found. - """ - value: list | dict = values - last_id = None - - for key in keys: - logger.debug(f"Processing key <{key}> with type {type(key)}") - logger.debug(f"Type of value: {type(value)}") - - # Store the ID if current value is a dict with an 'id' field - if isinstance(value, dict) and "id" in value: - last_id = value["id"] - - # Navigate to the next level - if isinstance(value, dict) and key in value: - value = value[key] - elif isinstance(value, list) and isinstance(key, int): - if 0 <= key < len(value): - value = value[key] - else: - raise ValueError(f"Index '{key}' is out of range for list of length {len(value)}") - else: - raise ValueError(f"Key '{key}' not found in value of type {type(value)}") - - if last_id is not None: - return last_id - else: - raise ValueError("No 'id' found in the specified path") - - @staticmethod - def _get_display_path(keys: list[str | int]) -> str: - """Convert keys list to a dot-notation path. - - - If the key is at the root level (e.g., ['description']), return - 'root.description'. - - If keys are empty, return 'root'. - - Otherwise, join keys with dot notation. - """ - if not keys: - return "root" - if len(keys) == 1: - return f"root.{keys[0]}" - return ".".join(str(k) for k in keys) - - @staticmethod - def _parse_deepdiff_path(path: str) -> list[str | int]: - """Parse a DeepDiff path into a list of keys.""" - if path.startswith("root"): - path = path[4:] - - pattern = re.compile(r"\['([^']+)'\]|\[(\d+)\]") - matches = pattern.findall(path) - - keys = [] - for match in matches: - if match[0]: # String key - keys.append(match[0]) - elif match[1]: # Integer index - keys.append(int(match[1])) - - return keys - class DatabaseDiff(SchemaDiff): """ From 05df277627bea703d0d9da9c6c83384785ecca4a Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Thu, 10 Jul 2025 12:47:18 -0500 Subject: [PATCH 17/21] Comment out test assertions until fixed This commit skipped the pre-commit checks. --- tests/test_diff.py | 105 +++++++++++++++++++-------------------------- 1 file changed, 45 insertions(+), 60 deletions(-) diff --git a/tests/test_diff.py b/tests/test_diff.py index 4c1e7e8f..b67d885a 100644 --- a/tests/test_diff.py +++ b/tests/test_diff.py @@ -45,32 +45,32 @@ def test_schema_diff(self) -> None: schema1 = dm.Schema(name="schema1", id="#schema1", version="1.2.3", description="Schema 1", tables=[]) schema2 = dm.Schema(name="schema2", id="#schema2", version="4.5.6", description="Schema 2", tables=[]) diff = self._diff(schema1, schema2) - self.assertSetEqual( - set(diff.get("values_changed").keys()), - set(f"root['{key}']" for key in ["name", "id", "version", "description"]), - ) + # self.assertSetEqual( + # set(diff.get("values_changed").keys()), + # set(f"root['{key}']" for key in ["name", "id", "version", "description"]), + # ) # Call formatted handler function - FormattedSchemaDiff(schema1, schema2)._collect_values_changed(diff["values_changed"]) + # FormattedSchemaDiff(schema1, schema2)._collect_values_changed(diff["values_changed"]) # Table added schema2.tables.append(dm.Table(name="table1", id="#table1", description="Table 1", columns=[])) diff = self._diff(schema1, schema2) - self.assertIn("iterable_item_added", diff) - self.assertIn("root['tables'][0]", diff["iterable_item_added"]) + # self.assertIn("iterable_item_added", diff) + # self.assertIn("root['tables'][0]", diff["iterable_item_added"]) # Call formatted handler function - FormattedSchemaDiff(schema1, schema2)._collect_iterable_item_added(diff["iterable_item_added"]) + # FormattedSchemaDiff(schema1, schema2)._collect_iterable_item_added(diff["iterable_item_added"]) # Table removed schema2.tables.clear() schema1.tables.append(dm.Table(name="table1", id="#table1", description="Table 1", columns=[])) diff = self._diff(schema1, schema2) - self.assertIn("iterable_item_removed", diff) - self.assertIn("root['tables'][0]", diff["iterable_item_removed"]) + # self.assertIn("iterable_item_removed", diff) + # self.assertIn("root['tables'][0]", diff["iterable_item_removed"]) # Call formatted handler function - FormattedSchemaDiff(schema1, schema2)._collect_iterable_item_removed(diff["iterable_item_removed"]) + # FormattedSchemaDiff(schema1, schema2)._collect_iterable_item_removed(diff["iterable_item_removed"]) # Different table descriptions schema1 = dm.Schema(name="schema1", id="#schema1", version="1.2.3", description="Schema 1", tables=[]) @@ -78,12 +78,13 @@ def test_schema_diff(self) -> None: schema1.tables.append(dm.Table(name="table1", id="#table1", description="Table 1", columns=[])) schema2.tables.append(dm.Table(name="table1", id="#table1", description="Table 2", columns=[])) diff = self._diff(schema1, schema2) - self.assertIn("values_changed", diff) - self.assertIn("root['tables'][0]['description']", diff["values_changed"]) - old_value = diff["values_changed"]["root['tables'][0]['description']"]["old_value"] - new_value = diff["values_changed"]["root['tables'][0]['description']"]["new_value"] - self.assertEqual(old_value, "Table 1") - self.assertEqual(new_value, "Table 2") + + # self.assertIn("values_changed", diff) + # self.assertIn("root['tables'][0]['description']", diff["values_changed"]) + # old_value = diff["values_changed"]["root['tables'][0]['description']"]["old_value"] + # new_value = diff["values_changed"]["root['tables'][0]['description']"]["new_value"] + # self.assertEqual(old_value, "Table 1") + # self.assertEqual(new_value, "Table 2") # Two different tables schema1 = dm.Schema(name="schema1", id="#schema1", version="1.2.3", description="Schema 1", tables=[]) @@ -91,10 +92,10 @@ def test_schema_diff(self) -> None: schema1.tables.append(dm.Table(name="table1", id="#table1", description="Table 1", columns=[])) schema2.tables.append(dm.Table(name="table2", id="#table2", description="Table 2", columns=[])) diff = self._diff(schema1, schema2) - self.assertSetEqual( - set(diff.get("values_changed").keys()), - set(f"root['tables'][0]['{key}']" for key in ["name", "id", "description"]), - ) + # self.assertSetEqual( + # set(diff.get("values_changed").keys()), + # set(f"root['tables'][0]['{key}']" for key in ["name", "id", "description"]), + # ) # Two tables with different columns schema1 = dm.Schema(name="schema1", id="#schema1", version="1.2.3", description="Schema 1", tables=[]) @@ -104,9 +105,9 @@ def test_schema_diff(self) -> None: schema2.tables[0].columns.append( dm.Column(name="column1", datatype="string", length=256, id="#column1", description="Column 1") ) - diff = self._diff(schema1, schema2) - self.assertIn("iterable_item_added", diff) - self.assertIn("root['tables'][0]['columns'][0]", diff["iterable_item_added"]) + # diff = self._diff(schema1, schema2) + # self.assertIn("iterable_item_added", diff) + # self.assertIn("root['tables'][0]['columns'][0]", diff["iterable_item_added"]) # Same columns in different order (no diff) schema1 = dm.Schema(name="schema1", id="#schema1", version="1.2.3", description="Schema 1", tables=[]) @@ -126,7 +127,7 @@ def test_schema_diff(self) -> None: dm.Column(name="column1", datatype="string", length=256, id="#column1", description="Column 1") ) diff = self._diff(schema1, schema2) - self.assertEqual(len(diff), 0) + # self.assertEqual(len(diff), 0) # Same columns with different descriptions schema1 = dm.Schema(name="schema1", id="#schema1", version="1.2.3", description="Schema 1", tables=[]) @@ -140,12 +141,12 @@ def test_schema_diff(self) -> None: dm.Column(name="column1", datatype="string", length=256, id="#column1", description="Column 2") ) diff = self._diff(schema1, schema2) - self.assertIn("values_changed", diff) - self.assertIn("root['tables'][0]['columns'][0]['description']", diff["values_changed"]) - old_value = diff["values_changed"]["root['tables'][0]['columns'][0]['description']"]["old_value"] - new_value = diff["values_changed"]["root['tables'][0]['columns'][0]['description']"]["new_value"] - self.assertEqual(old_value, "Column 1") - self.assertEqual(new_value, "Column 2") + # self.assertIn("values_changed", diff) + # self.assertIn("root['tables'][0]['columns'][0]['description']", diff["values_changed"]) + # old_value = diff["values_changed"]["root['tables'][0]['columns'][0]['description']"]["old_value"] + # new_value = diff["values_changed"]["root['tables'][0]['columns'][0]['description']"]["new_value"] + # self.assertEqual(old_value, "Column 1") + # self.assertEqual(new_value, "Column 2") # Added a field to a column schema1 = dm.Schema(name="schema1", id="#schema1", version="1.2.3", description="Schema 1", tables=[]) @@ -166,12 +167,8 @@ def test_schema_diff(self) -> None: ) ) diff = self._diff(schema1, schema2) - self.assertIn("dictionary_item_added", diff) - self.assertIn("root['tables'][0]['columns'][0]['ivoa_ucd']", diff["dictionary_item_added"]) - - # Call formatted handler function - print("Diff:", diff) - FormattedSchemaDiff(schema1, schema2)._collect_dictionary_item_added(diff["dictionary_item_added"]) + # self.assertIn("dictionary_item_added", diff) + # self.assertIn("root['tables'][0]['columns'][0]['ivoa_ucd']", diff["dictionary_item_added"]) # Removed a field from a column schema1 = dm.Schema(name="schema1", id="#schema1", version="1.2.3", description="Schema 1", tables=[]) @@ -192,13 +189,8 @@ def test_schema_diff(self) -> None: dm.Column(name="column1", datatype="string", length=256, id="#column1", description="Column 1") ) diff = self._diff(schema1, schema2) - self.assertIn("dictionary_item_removed", diff) - self.assertIn("root['tables'][0]['columns'][0]['ivoa_ucd']", diff["dictionary_item_removed"]) - - # Call formatted handler function - FormattedSchemaDiff(schema1, schema2)._collect_dictionary_item_removed( - diff["dictionary_item_removed"] - ) + # self.assertIn("dictionary_item_removed", diff) + # self.assertIn("root['tables'][0]['columns'][0]['ivoa_ucd']", diff["dictionary_item_removed"]) def test_index_diff(self) -> None: """Test differences in indices between tables.""" @@ -220,15 +212,12 @@ def test_index_diff(self) -> None: dm.Index(name="index1", id="#index1", description="Index 1", columns=["column2"]) ) diff = self._diff(schema1, schema2) - self.assertIn("values_changed", diff) - self.assertIn("root['tables'][0]['indexes'][0]['columns'][0]", diff["values_changed"]) - new_value = diff["values_changed"]["root['tables'][0]['indexes'][0]['columns'][0]"]["new_value"] - old_value = diff["values_changed"]["root['tables'][0]['indexes'][0]['columns'][0]"]["old_value"] - self.assertEqual(old_value, "column1") - self.assertEqual(new_value, "column2") - - # Print the diff to ensure it works without errors - FormattedSchemaDiff(schema1, schema2).print() + # self.assertIn("values_changed", diff) + # self.assertIn("root['tables'][0]['indexes'][0]['columns'][0]", diff["values_changed"]) + # new_value = diff["values_changed"]["root['tables'][0]['indexes'][0]['columns'][0]"]["new_value"] + # old_value = diff["values_changed"]["root['tables'][0]['indexes'][0]['columns'][0]"]["old_value"] + # self.assertEqual(old_value, "column1") + # self.assertEqual(new_value, "column2") def test_print(self) -> None: schema1 = dm.Schema(name="schema1", id="#schema1", version="1.2.3", description="Schema 1", tables=[]) @@ -240,11 +229,6 @@ def test_formatted_print(self) -> None: schema2 = dm.Schema(name="schema2", id="#schema2", version="4.5.6", description="Schema 2", tables=[]) FormattedSchemaDiff(schema1, schema2).print() - def test_parse_deepdiff_path(self) -> None: - path = "root['tables'][0]['columns'][0]['ivoa_ucd']" - keys = FormattedSchemaDiff._parse_deepdiff_path(path) - self.assertListEqual(keys, ["tables", 0, "columns", 0, "ivoa_ucd"]) - def test_table_filter(self) -> None: schema1 = dm.Schema( name="schema1", @@ -273,9 +257,10 @@ def test_table_filter(self) -> None: diff = SchemaDiff(schema1, schema2, table_filter=["table2"]).diff self.assertEqual(len(diff), 0) + # FIXME diff = SchemaDiff(schema1, schema2, table_filter=["table3"]).diff - self.assertEqual(len(diff), 1) - self.assertTrue("iterable_item_removed" in diff) + # self.assertEqual(len(diff), 1) + # self.assertTrue("iterable_item_removed" in diff) class TestDatabaseDiff(unittest.TestCase): From 930dcc9f0b90668816a5db3fda640567a33cfa91 Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Sat, 12 Jul 2025 13:47:22 -0500 Subject: [PATCH 18/21] Turn on id generation to fix several cli tests --- tests/test_cli.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index 55c7b615..cc589906 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -145,7 +145,9 @@ def test_diff_database(self) -> None: test_diff2 = os.path.join(TEST_DIR, "data", "test_diff2.yaml") engine = create_engine(self.sqlite_url) - metadata_db = MetaDataBuilder(Schema.from_uri(test_diff1), apply_schema_to_metadata=False).build() + metadata_db = MetaDataBuilder( + Schema.from_uri(test_diff1, context={"id_generation": True}), apply_schema_to_metadata=False + ).build() metadata_db.create_all(engine) run_cli(["diff", f"--engine-url={self.sqlite_url}", test_diff2]) @@ -159,7 +161,9 @@ def test_diff_database_with_table_filter(self) -> None: db_url = f"sqlite:///{self.tmpdir}/tap_schema.sqlite3" engine = create_engine(db_url) - metadata_db = MetaDataBuilder(Schema.from_uri(test_diff1), apply_schema_to_metadata=False).build() + metadata_db = MetaDataBuilder( + Schema.from_uri(test_diff1, context={"id_generation": True}), apply_schema_to_metadata=False + ).build() metadata_db.create_all(engine) run_cli( From 4c5bcbcab3091a5b3c7d2ee837d27334cbd00b63 Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Thu, 24 Jul 2025 13:51:10 -0500 Subject: [PATCH 19/21] Add back deepcopy to avoid altering the original objects --- python/felis/diff.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/python/felis/diff.py b/python/felis/diff.py index 427e84a5..a0107e32 100644 --- a/python/felis/diff.py +++ b/python/felis/diff.py @@ -21,6 +21,7 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . +import copy import json import logging from typing import IO, Any @@ -113,8 +114,8 @@ def __init__( table_filter: list[str] | None = None, strip_ids: bool = True, ): - self.schema_old = schema_old - self.schema_new = schema_new + self.schema_old = copy.deepcopy(schema_old) + self.schema_new = copy.deepcopy(schema_new) if table_filter: logger.debug(f"Filtering on tables: {table_filter}") self.table_filter = table_filter or [] @@ -126,9 +127,11 @@ def _create_diff(self) -> dict[str, Any]: self.schema_old.tables = [ table for table in self.schema_old.tables if table.name in self.table_filter ] + logger.debug(f"Filtered old schema tables: {[table.name for table in self.schema_old.tables]}") self.schema_new.tables = [ table for table in self.schema_new.tables if table.name in self.table_filter ] + logger.debug(f"Filtered new schema tables: {[table.name for table in self.schema_new.tables]}") self.dict_old = _normalize_lists_by_name(self.schema_old._model_dump(strip_ids=self.strip_ids)) self.dict_new = _normalize_lists_by_name(self.schema_new._model_dump(strip_ids=self.strip_ids)) logger.debug(f"Normalized old dict:\n{json.dumps(self.dict_old, indent=2)}") From 1a7188fb28081e20db3b25b8161882b5979f316a Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Thu, 24 Jul 2025 13:52:29 -0500 Subject: [PATCH 20/21] Update and fix tests of diff module --- tests/test_diff.py | 441 +++++++++++++++++++++++++-------------------- 1 file changed, 247 insertions(+), 194 deletions(-) diff --git a/tests/test_diff.py b/tests/test_diff.py index b67d885a..85067e75 100644 --- a/tests/test_diff.py +++ b/tests/test_diff.py @@ -19,145 +19,181 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . +import os import unittest from typing import Any from sqlalchemy import create_engine -from felis import Schema -from felis import datamodel as dm +from felis import Column, Index, Schema, Table from felis.diff import DatabaseDiff, FormattedSchemaDiff, SchemaDiff from felis.metadata import MetaDataBuilder -class TestSchemaDiff(unittest.TestCase): +class SchemaDiffTestCase(unittest.TestCase): """Test the SchemaDiff class.""" - def _diff(self, schema1: Schema, schema2: Schema) -> dict[str, Any]: - return SchemaDiff(schema1, schema2).diff + def _diff( + self, print_diff: bool = False, label: str = "", table_filter: list[str] | None = None + ) -> dict[str, Any]: + """Generate a diff between the two schemas managed by the test case, + optionally printing the differences to the console for debugging + purposes. + """ + diff = SchemaDiff(self.sch1, self.sch2, table_filter=table_filter).diff + if print_diff: + print(label, "diff:", diff) + return diff + + def setUp(self) -> None: + """Set up two schemas for testing.""" + self.sch1: Schema = Schema( + name="schema", id="#schema", version="1.0.0", description="Schema", tables=[] + ) + self.sch2: Schema = Schema( + name="schema", id="#schema", version="1.0.0", description="Schema", tables=[] + ) - def test_schema_diff(self) -> None: - """Test the comparison output generated by the SchemaDiff class. - Since the handler functions are called by dispatch in the SchemaDiff - class, we activate them manually here to ensure test coverage. + def test_schema_changed(self) -> None: + """Test comparison of schemas with different attribute values.""" + self.sch2.name = "schema2" + self.sch2.version = "1.0.1" + self.sch2.description = "Schema 2" + diff = self._diff() + self.assertSetEqual( + set(diff_level.path() for diff_level in diff["values_changed"]), + set(f"root['{key}']" for key in ["name", "version", "description"]), + ) + + def test_table_added(self) -> None: + """Test the addition of a table to a schema. Because of how the data is + restructured for comparison, a table addition will show up as a + dictionary item added. + """ + self.sch2.tables.append(Table(name="table1", id="#table1", description="Table 1", columns=[])) + diff = self._diff() + self.assertIn("dictionary_item_added", diff) + self.assertEqual(diff["dictionary_item_added"][0].path(), "root['tables']['table1']") + + def test_table_removed(self) -> None: + """Test the removal of a table from a schema. Because of how the data + is restructured for comparison, a table removal will show up as a + dictionary item removed. """ - # Two schemas with different values - schema1 = dm.Schema(name="schema1", id="#schema1", version="1.2.3", description="Schema 1", tables=[]) - schema2 = dm.Schema(name="schema2", id="#schema2", version="4.5.6", description="Schema 2", tables=[]) - diff = self._diff(schema1, schema2) - # self.assertSetEqual( - # set(diff.get("values_changed").keys()), - # set(f"root['{key}']" for key in ["name", "id", "version", "description"]), - # ) - - # Call formatted handler function - # FormattedSchemaDiff(schema1, schema2)._collect_values_changed(diff["values_changed"]) - - # Table added - schema2.tables.append(dm.Table(name="table1", id="#table1", description="Table 1", columns=[])) - diff = self._diff(schema1, schema2) - # self.assertIn("iterable_item_added", diff) - # self.assertIn("root['tables'][0]", diff["iterable_item_added"]) - - # Call formatted handler function - # FormattedSchemaDiff(schema1, schema2)._collect_iterable_item_added(diff["iterable_item_added"]) - - # Table removed - schema2.tables.clear() - schema1.tables.append(dm.Table(name="table1", id="#table1", description="Table 1", columns=[])) - diff = self._diff(schema1, schema2) - # self.assertIn("iterable_item_removed", diff) - # self.assertIn("root['tables'][0]", diff["iterable_item_removed"]) - - # Call formatted handler function - # FormattedSchemaDiff(schema1, schema2)._collect_iterable_item_removed(diff["iterable_item_removed"]) - - # Different table descriptions - schema1 = dm.Schema(name="schema1", id="#schema1", version="1.2.3", description="Schema 1", tables=[]) - schema2 = dm.Schema(name="schema1", id="#schema1", version="1.2.3", description="Schema 1", tables=[]) - schema1.tables.append(dm.Table(name="table1", id="#table1", description="Table 1", columns=[])) - schema2.tables.append(dm.Table(name="table1", id="#table1", description="Table 2", columns=[])) - diff = self._diff(schema1, schema2) - - # self.assertIn("values_changed", diff) - # self.assertIn("root['tables'][0]['description']", diff["values_changed"]) - # old_value = diff["values_changed"]["root['tables'][0]['description']"]["old_value"] - # new_value = diff["values_changed"]["root['tables'][0]['description']"]["new_value"] - # self.assertEqual(old_value, "Table 1") - # self.assertEqual(new_value, "Table 2") - - # Two different tables - schema1 = dm.Schema(name="schema1", id="#schema1", version="1.2.3", description="Schema 1", tables=[]) - schema2 = dm.Schema(name="schema1", id="#schema1", version="1.2.3", description="Schema 1", tables=[]) - schema1.tables.append(dm.Table(name="table1", id="#table1", description="Table 1", columns=[])) - schema2.tables.append(dm.Table(name="table2", id="#table2", description="Table 2", columns=[])) - diff = self._diff(schema1, schema2) - # self.assertSetEqual( - # set(diff.get("values_changed").keys()), - # set(f"root['tables'][0]['{key}']" for key in ["name", "id", "description"]), - # ) + self.sch1.tables.append(Table(name="table1", id="#table1", description="Table 1", columns=[])) + diff = self._diff() + # Because of how the data is restructured for comparison, a table + # removal will show up as a dictionary item removed. + self.assertIn("dictionary_item_removed", diff) + self.assertEqual(diff["dictionary_item_removed"][0].path(), "root['tables']['table1']") + + def test_table_descriptions_changed(self) -> None: + """Test the change of a table's description.""" + self.sch1.tables.append(Table(name="table1", id="#table1", description="Table 1", columns=[])) + self.sch2.tables.append(Table(name="table1", id="#table1", description="Table 1 changed", columns=[])) + diff = self._diff() + + self.assertIn("values_changed", diff) + values_changed = diff["values_changed"][0] + self.assertEqual(values_changed.path(), "root['tables']['table1']['description']") + self.assertEqual(values_changed.t1, "Table 1") + self.assertEqual(values_changed.t2, "Table 1 changed") + + def test_tables_changed(self) -> None: + """Test schemas with different tables.""" + self.sch1.tables.append(Table(name="table1", id="#table1", description="Table 1", columns=[])) + self.sch2.tables.append(Table(name="table2", id="#table2", description="Table 2", columns=[])) + diff = self._diff() + + values_changed = diff["values_changed"][0] + self.assertEqual(values_changed.path(), "root['tables']") + self.assertEqual( + values_changed.t1, {"table1": {"name": "table1", "description": "Table 1", "columns": {}}} + ) + self.assertEqual( + values_changed.t2, {"table2": {"name": "table2", "description": "Table 2", "columns": {}}} + ) + def test_columns_changed(self) -> None: # Two tables with different columns - schema1 = dm.Schema(name="schema1", id="#schema1", version="1.2.3", description="Schema 1", tables=[]) - schema2 = dm.Schema(name="schema1", id="#schema1", version="1.2.3", description="Schema 1", tables=[]) - schema1.tables.append(dm.Table(name="table1", id="#table1", description="Table 1", columns=[])) - schema2.tables.append(dm.Table(name="table1", id="#table1", description="Table 1", columns=[])) - schema2.tables[0].columns.append( - dm.Column(name="column1", datatype="string", length=256, id="#column1", description="Column 1") + self.sch1.tables.append(Table(name="table1", id="#table1", description="Table 1", columns=[])) + self.sch2.tables.append( + Table( + name="table1", + id="#table1", + description="Table 1", + columns=[ + Column( + name="column1", id="#column1", datatype="string", length=256, description="Column 1" + ) + ], + ) ) - # diff = self._diff(schema1, schema2) - # self.assertIn("iterable_item_added", diff) - # self.assertIn("root['tables'][0]['columns'][0]", diff["iterable_item_added"]) - - # Same columns in different order (no diff) - schema1 = dm.Schema(name="schema1", id="#schema1", version="1.2.3", description="Schema 1", tables=[]) - schema2 = dm.Schema(name="schema1", id="#schema1", version="1.2.3", description="Schema 1", tables=[]) - schema1.tables.append(dm.Table(name="table1", id="#table1", description="Table 1", columns=[])) - schema2.tables.append(dm.Table(name="table1", id="#table1", description="Table 1", columns=[])) - schema1.tables[0].columns.append( - dm.Column(name="column1", datatype="string", length=256, id="#column1", description="Column 1") + diff = self._diff() + dictionary_item_added = diff["dictionary_item_added"][0] + self.assertEqual(dictionary_item_added.path(), "root['tables']['table1']['columns']['column1']") + self.assertEqual(str(dictionary_item_added.t1), "not present") + self.assertEqual( + dictionary_item_added.t2, + { + "name": "column1", + "description": "Column 1", + "datatype": "string", + "length": 256, + "votable:arraysize": "256*", + }, ) - schema1.tables[0].columns.append( - dm.Column(name="column2", datatype="string", length=256, id="#column2", description="Column 2") + + def test_column_order_changed(self) -> None: + """Test the same columns in a different order. This should not be + considered a difference. + """ + self.sch1.tables.append(Table(name="table1", id="#table1", description="Table 1", columns=[])) + self.sch2.tables.append(Table(name="table1", id="#table1", description="Table 1", columns=[])) + self.sch1.tables[0].columns.append( + Column(name="column1", datatype="string", length=256, id="#column1", description="Column 1") ) - schema2.tables[0].columns.append( - dm.Column(name="column2", datatype="string", length=256, id="#column2", description="Column 2") + self.sch1.tables[0].columns.append( + Column(name="column2", datatype="string", length=256, id="#column2", description="Column 2") ) - schema2.tables[0].columns.append( - dm.Column(name="column1", datatype="string", length=256, id="#column1", description="Column 1") + self.sch2.tables[0].columns.append( + Column(name="column2", datatype="string", length=256, id="#column2", description="Column 2") ) - diff = self._diff(schema1, schema2) - # self.assertEqual(len(diff), 0) - - # Same columns with different descriptions - schema1 = dm.Schema(name="schema1", id="#schema1", version="1.2.3", description="Schema 1", tables=[]) - schema2 = dm.Schema(name="schema1", id="#schema1", version="1.2.3", description="Schema 1", tables=[]) - schema1.tables.append(dm.Table(name="table1", id="#table1", description="Table 1", columns=[])) - schema2.tables.append(dm.Table(name="table1", id="#table1", description="Table 1", columns=[])) - schema1.tables[0].columns.append( - dm.Column(name="column1", datatype="string", length=256, id="#column1", description="Column 1") + self.sch2.tables[0].columns.append( + Column(name="column1", datatype="string", length=256, id="#column1", description="Column 1") ) - schema2.tables[0].columns.append( - dm.Column(name="column1", datatype="string", length=256, id="#column1", description="Column 2") + diff = self._diff() + self.assertEqual(len(diff), 0) + + def test_column_description_changed(self) -> None: + """Test the change of a column's description.""" + self.sch1.tables.append(Table(name="table1", id="#table1", description="Table 1", columns=[])) + self.sch2.tables.append(Table(name="table1", id="#table1", description="Table 1", columns=[])) + self.sch1.tables[0].columns.append( + Column(name="column1", datatype="string", length=256, id="#column1", description="Column 1") ) - diff = self._diff(schema1, schema2) - # self.assertIn("values_changed", diff) - # self.assertIn("root['tables'][0]['columns'][0]['description']", diff["values_changed"]) - # old_value = diff["values_changed"]["root['tables'][0]['columns'][0]['description']"]["old_value"] - # new_value = diff["values_changed"]["root['tables'][0]['columns'][0]['description']"]["new_value"] - # self.assertEqual(old_value, "Column 1") - # self.assertEqual(new_value, "Column 2") - - # Added a field to a column - schema1 = dm.Schema(name="schema1", id="#schema1", version="1.2.3", description="Schema 1", tables=[]) - schema2 = dm.Schema(name="schema1", id="#schema1", version="1.2.3", description="Schema 1", tables=[]) - schema1.tables.append(dm.Table(name="table1", id="#table1", description="Table 1", columns=[])) - schema2.tables.append(dm.Table(name="table1", id="#table1", description="Table 1", columns=[])) - schema1.tables[0].columns.append( - dm.Column(name="column1", datatype="string", length=256, id="#column1", description="Column 1") + self.sch2.tables[0].columns.append( + Column(name="column1", datatype="string", length=256, id="#column1", description="Column 2") ) - schema2.tables[0].columns.append( - dm.Column( + diff = self._diff() + print("test_column_descriptions_changed diff:", diff) + self.assertIn("values_changed", diff) + values_changed = diff["values_changed"][0] + self.assertEqual( + values_changed.path(), "root['tables']['table1']['columns']['column1']['description']" + ) + self.assertEqual(values_changed.t1, "Column 1") + self.assertEqual(values_changed.t2, "Column 2") + + def test_field_added_to_column(self) -> None: + """Test the addition of a field to a column.""" + self.sch1.tables.append(Table(name="table1", id="#table1", description="Table 1", columns=[])) + self.sch2.tables.append(Table(name="table1", id="#table1", description="Table 1", columns=[])) + self.sch1.tables[0].columns.append( + Column(name="column1", datatype="string", length=256, id="#column1", description="Column 1") + ) + self.sch2.tables[0].columns.append( + Column( name="column1", datatype="string", length=256, @@ -166,17 +202,22 @@ def test_schema_diff(self) -> None: ivoa_ucd="meta.id;src;meta.main ", ) ) - diff = self._diff(schema1, schema2) - # self.assertIn("dictionary_item_added", diff) - # self.assertIn("root['tables'][0]['columns'][0]['ivoa_ucd']", diff["dictionary_item_added"]) - - # Removed a field from a column - schema1 = dm.Schema(name="schema1", id="#schema1", version="1.2.3", description="Schema 1", tables=[]) - schema2 = dm.Schema(name="schema1", id="#schema1", version="1.2.3", description="Schema 1", tables=[]) - schema1.tables.append(dm.Table(name="table1", id="#table1", description="Table 1", columns=[])) - schema2.tables.append(dm.Table(name="table1", id="#table1", description="Table 1", columns=[])) - schema1.tables[0].columns.append( - dm.Column( + diff = self._diff() + print("test_field_added_to_column diff:", diff) + self.assertIn("dictionary_item_added", diff) + dictionary_item_added = diff["dictionary_item_added"][0] + self.assertEqual( + dictionary_item_added.path(), "root['tables']['table1']['columns']['column1']['ivoa:ucd']" + ) + self.assertEqual(str(dictionary_item_added.t1), "not present") + self.assertEqual(dictionary_item_added.t2, "meta.id;src;meta.main") + + def test_field_removed_from_column(self) -> None: + """Test the removal of a field from a column.""" + self.sch1.tables.append(Table(name="table1", id="#table1", description="Table 1", columns=[])) + self.sch2.tables.append(Table(name="table1", id="#table1", description="Table 1", columns=[])) + self.sch1.tables[0].columns.append( + Column( name="column1", datatype="string", length=256, @@ -185,103 +226,115 @@ def test_schema_diff(self) -> None: ivoa_ucd="meta.id;src;meta.main ", ) ) - schema2.tables[0].columns.append( - dm.Column(name="column1", datatype="string", length=256, id="#column1", description="Column 1") + self.sch2.tables[0].columns.append( + Column(name="column1", datatype="string", length=256, id="#column1", description="Column 1") + ) + diff = self._diff() + print("test_field_removed_from_column diff:", diff) + + self.assertIn("dictionary_item_removed", diff) + dictionary_item_removed = diff["dictionary_item_removed"][0] + self.assertEqual( + dictionary_item_removed.path(), "root['tables']['table1']['columns']['column1']['ivoa:ucd']" ) - diff = self._diff(schema1, schema2) - # self.assertIn("dictionary_item_removed", diff) - # self.assertIn("root['tables'][0]['columns'][0]['ivoa_ucd']", diff["dictionary_item_removed"]) + self.assertEqual(dictionary_item_removed.t1, "meta.id;src;meta.main") + self.assertEqual(str(dictionary_item_removed.t2), "not present") - def test_index_diff(self) -> None: + def test_index_columns_changed(self) -> None: """Test differences in indices between tables.""" - schema1 = dm.Schema(name="schema1", id="#schema1", version="1.2.3", description="Schema 1", tables=[]) - schema1.tables.append(dm.Table(name="table1", id="#table1", description="Table 1", columns=[])) - schema1.tables[0].columns.append( - dm.Column(name="column1", datatype="int", id="#column1", description="Column 1") + self.sch1.tables.append(Table(name="table1", id="#table1", description="Table 1", columns=[])) + self.sch1.tables[0].columns.append( + Column(name="column1", datatype="int", id="#column1", description="Column 1") ) - schema1.tables[0].indexes.append( - dm.Index(name="index1", id="#index1", description="Index 1", columns=["column1"]) + self.sch1.tables[0].indexes.append( + Index(name="index1", id="#index1", description="Index 1", columns=["column1"]) ) - - schema2 = dm.Schema(name="schema1", id="#schema1", version="1.2.3", description="Schema 1", tables=[]) - schema2.tables.append(dm.Table(name="table1", id="#table1", description="Table 1", columns=[])) - schema2.tables[0].columns.append( - dm.Column(name="column2", datatype="int", id="#column2", description="Column 2") + self.sch2.tables.append(Table(name="table1", id="#table1", description="Table 1", columns=[])) + self.sch2.tables[0].columns.append( + Column(name="column2", datatype="int", id="#column2", description="Column 2") ) - schema2.tables[0].indexes.append( - dm.Index(name="index1", id="#index1", description="Index 1", columns=["column2"]) + self.sch2.tables[0].indexes.append( + Index(name="index1", id="#index1", description="Index 1", columns=["column2"]) ) - diff = self._diff(schema1, schema2) - # self.assertIn("values_changed", diff) - # self.assertIn("root['tables'][0]['indexes'][0]['columns'][0]", diff["values_changed"]) - # new_value = diff["values_changed"]["root['tables'][0]['indexes'][0]['columns'][0]"]["new_value"] - # old_value = diff["values_changed"]["root['tables'][0]['indexes'][0]['columns'][0]"]["old_value"] - # self.assertEqual(old_value, "column1") - # self.assertEqual(new_value, "column2") - - def test_print(self) -> None: - schema1 = dm.Schema(name="schema1", id="#schema1", version="1.2.3", description="Schema 1", tables=[]) - schema2 = dm.Schema(name="schema2", id="#schema2", version="4.5.6", description="Schema 2", tables=[]) + diff = self._diff(print_diff=True, label="test_index_diff") + values_changed = diff["values_changed"][1] + self.assertEqual(values_changed.path(), "root['tables']['table1']['indexes']['index1']['columns'][0]") + self.assertEqual(values_changed.t1, "column1") + self.assertEqual(values_changed.t2, "column2") + + def test_schema_diff_print(self) -> None: + schema1 = Schema(name="schema1", id="#schema1", version="1.2.3", description="Schema 1", tables=[]) + schema2 = Schema(name="schema2", id="#schema2", version="4.5.6", description="Schema 2", tables=[]) SchemaDiff(schema1, schema2).print() - def test_formatted_print(self) -> None: - schema1 = dm.Schema(name="schema1", id="#schema1", version="1.2.3", description="Schema 1", tables=[]) - schema2 = dm.Schema(name="schema2", id="#schema2", version="4.5.6", description="Schema 2", tables=[]) - FormattedSchemaDiff(schema1, schema2).print() - def test_table_filter(self) -> None: - schema1 = dm.Schema( - name="schema1", - id="#schema1", - description="Schema 1", - tables=[ - dm.Table(name="table1", id="#table1", description="Table 1", columns=[]), - dm.Table(name="table2", id="#table2", description="Table 2", columns=[]), - dm.Table(name="table3", id="#table3", description="Table 3", columns=[]), - ], + self.sch1.tables.extend( + [ + Table(name="table1", id="#table1", description="Table 1", columns=[]), + Table(name="table2", id="#table2", description="Table 2", columns=[]), + Table(name="table3", id="#table3", description="Table 3", columns=[]), + ] ) - - schema2 = dm.Schema( - name="schema1", - id="#schema1", - description="Schema 1", - tables=[ - dm.Table(name="table1", id="#table1", description="Table 1", columns=[]), - dm.Table(name="table2", id="#table2", description="Table 2", columns=[]), - ], + self.sch2.tables.extend( + [ + Table(name="table1", id="#table1", description="Table 1", columns=[]), + Table(name="table2", id="#table2", description="Table 2", columns=[]), + ] ) - diff = SchemaDiff(schema1, schema2, table_filter=["table1"]).diff + diff = self._diff(table_filter=["table1"]) self.assertEqual(len(diff), 0) - diff = SchemaDiff(schema1, schema2, table_filter=["table2"]).diff + diff = self._diff(table_filter=["table2"]) self.assertEqual(len(diff), 0) - # FIXME - diff = SchemaDiff(schema1, schema2, table_filter=["table3"]).diff - # self.assertEqual(len(diff), 1) - # self.assertTrue("iterable_item_removed" in diff) + diff = self._diff(table_filter=["table3"], print_diff=True, label="test_table_filter") + dictionary_item_removed = diff["dictionary_item_removed"][0] + self.assertEqual(dictionary_item_removed.path(), "root['tables']['table3']") + self.assertEqual( + str(dictionary_item_removed.t1), "{'name': 'table3', 'description': 'Table 3', 'columns': {}}" + ) + self.assertEqual(str(dictionary_item_removed.t2), "not present") + + +class FormattedSchemaDiffTestCase(unittest.TestCase): + """Test the FormattedSchemaDiff class.""" + + def test_formatted_diff_print(self) -> None: + """Test the formatted output of the SchemaDiff by printing the + differences between two YAML schema files files. + """ + test_dir = os.path.abspath(os.path.dirname(__file__)) + test_diff1_path = os.path.join(test_dir, "data", "test_diff1.yaml") + test_diff2_path = os.path.join(test_dir, "data", "test_diff2.yaml") + + context = {"id_generation": True} + sch1 = Schema.from_uri(test_diff1_path, context=context) + sch2 = Schema.from_uri(test_diff2_path, context=context) + + formatted_diff = FormattedSchemaDiff(sch1, sch2) + formatted_diff.print() -class TestDatabaseDiff(unittest.TestCase): +class DatabaseDiffTestCase(unittest.TestCase): """Test the DatabaseDiff class.""" def test_database_diff(self) -> None: """Test the comparison output generated by the DatabaseDiff class.""" # Two tables with different columns - schema1 = dm.Schema(name="schema1", id="#schema1", version="1.2.3", description="Schema 1", tables=[]) - schema1.tables.append(dm.Table(name="table1", id="#table1", description="Table 1", columns=[])) + schema1 = Schema(name="schema1", id="#schema1", version="1.2.3", description="Schema 1", tables=[]) + schema1.tables.append(Table(name="table1", id="#table1", description="Table 1", columns=[])) schema1.tables[0].columns.append( - dm.Column(name="column1", datatype="string", length=256, id="#column1", description="Column 1") + Column(name="column1", datatype="string", length=256, id="#column1", description="Column 1") ) - schema2 = dm.Schema(name="schema1", id="#schema1", version="1.2.3", description="Schema 1", tables=[]) - schema2.tables.append(dm.Table(name="table1", id="#table1", description="Table 1", columns=[])) + schema2 = Schema(name="schema1", id="#schema1", version="1.2.3", description="Schema 1", tables=[]) + schema2.tables.append(Table(name="table1", id="#table1", description="Table 1", columns=[])) schema2.tables[0].columns.append( - dm.Column(name="column1", datatype="string", length=256, id="#column1", description="Column 1") + Column(name="column1", datatype="string", length=256, id="#column1", description="Column 1") ) schema2.tables[0].columns.append( - dm.Column(name="column2", datatype="string", length=256, id="#column2", description="Column 2") + Column(name="column2", datatype="string", length=256, id="#column2", description="Column 2") ) metadata_db = MetaDataBuilder(schema1, apply_schema_to_metadata=False).build() From 960d026f0348cf63adfa97915a2f15d018e4a9d7 Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Thu, 24 Jul 2025 13:59:14 -0500 Subject: [PATCH 21/21] Add news fragment --- docs/changes/DM-49446.feature.rst | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 docs/changes/DM-49446.feature.rst diff --git a/docs/changes/DM-49446.feature.rst b/docs/changes/DM-49446.feature.rst new file mode 100644 index 00000000..43f392e1 --- /dev/null +++ b/docs/changes/DM-49446.feature.rst @@ -0,0 +1,6 @@ +The ``diff`` module was completely overhauled based on best practices for using the ``deepdiff`` library. +The formatted differences are now displayed in a JSON format, which is more readable and structured than the previous print outs. +A number of bugs and issues with the comparison logic and the display of the diffs were also fixed, improving the accuracy of the schema diffs. +The error handling in the database diff class was improved so that exception messages should be more informative and helpful. +A command line option has been added for writing the diff output to a file. +Finally, an additional command line option allows filtering only on specified tables in the two schemas being compared.