From 476b310c415d977942c5ab3aaddb5de15b574968 Mon Sep 17 00:00:00 2001 From: Andrzej Swatowski Date: Mon, 7 Feb 2022 17:04:17 +0100 Subject: [PATCH 1/6] Add dp lint command --- CHANGELOG.md | 4 + data_pipelines_cli/cli.py | 2 + data_pipelines_cli/cli_commands/lint.py | 144 +++++++++++++++ data_pipelines_cli/errors.py | 10 ++ .../data_pipelines_cli.cli_commands.rst | 9 +- setup.py | 4 + tests/cli_commands/test_lint.py | 166 ++++++++++++++++++ 7 files changed, 338 insertions(+), 1 deletion(-) create mode 100644 data_pipelines_cli/cli_commands/lint.py create mode 100644 tests/cli_commands/test_lint.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 4e716ce..b63cc81 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,10 @@ ## [Unreleased] +### Added + +- `dp lint` command that uses [SQLFluff](https://github.com/sqlfluff/sqlfluff) to lint models and tests + ## [0.19.0] - 2022-04-25 ### Added diff --git a/data_pipelines_cli/cli.py b/data_pipelines_cli/cli.py index c6845e9..fc52a0c 100644 --- a/data_pipelines_cli/cli.py +++ b/data_pipelines_cli/cli.py @@ -9,6 +9,7 @@ from .cli_commands.docs import docs_command from .cli_commands.generate.generate import generate_group from .cli_commands.init import init_command +from .cli_commands.lint import lint_command from .cli_commands.prepare_env import prepare_env_command from .cli_commands.publish import publish_command from .cli_commands.run import run_command @@ -43,6 +44,7 @@ def cli() -> None: _cli.add_command(docs_command) _cli.add_command(generate_group) _cli.add_command(init_command) +_cli.add_command(lint_command) _cli.add_command(list_templates_command) _cli.add_command(prepare_env_command) _cli.add_command(publish_command) diff --git a/data_pipelines_cli/cli_commands/lint.py b/data_pipelines_cli/cli_commands/lint.py new file mode 100644 index 0000000..0cef69c --- /dev/null +++ b/data_pipelines_cli/cli_commands/lint.py @@ -0,0 +1,144 @@ +import pathlib +import tempfile +from configparser import ConfigParser +from typing import List + +import click +import yaml + +from ..cli_constants import BUILD_DIR +from ..cli_utils import echo_info, echo_subinfo, echo_warning, subprocess_run +from ..config_generation import ( + generate_profiles_yml, + read_dictionary_from_config_directory, +) +from ..errors import SQLLintError, SubprocessNonZeroExitError + +SQLFLUFF_FIX_NOT_EVERYTHING_ERROR = 1 +SQLFLUFF_LINT_ERROR = 65 # according to `sqlfluff.core.linter.LintingResult.stats` +SQLFLUFF_DIALECT_LOADING_ERROR = 66 # according to `sqlfluff.cli.commands.get_config` + + +def _get_dialect_or_default() -> str: + """Read ``dbt.yml`` config file and return its ``target_type`` or just the ``ansi``.""" + env, dbt_filename = "base", "dbt.yml" + dbt_env_config = read_dictionary_from_config_directory( + BUILD_DIR.joinpath("dag"), env, dbt_filename + ) or read_dictionary_from_config_directory(pathlib.Path.cwd(), env, dbt_filename) + try: + dialect = dbt_env_config["target_type"] + echo_subinfo(f'Found target_type "{dialect}", attempting to use it as the SQL dialect.') + except KeyError: + dialect = "ansi" + echo_warning( + 'Could not find `target_type` in `dbt.yml`. Using the default SQL dialect ("ansi").' + ) + return dialect + + +def _get_source_tests_paths() -> List[pathlib.Path]: + with open(pathlib.Path.cwd().joinpath("dbt_project.yml"), "r") as f: + dbt_project_config = yaml.safe_load(f) + dir_names: List[str] = ( + dbt_project_config.get("source-paths", []) + + dbt_project_config.get("model-paths", []) + + dbt_project_config.get("test-paths", []) + ) + return list(map(lambda dir_name: pathlib.Path.cwd().joinpath(dir_name), dir_names)) + + +def _create_temporary_sqlfluff_config(env: str) -> ConfigParser: + config = ConfigParser() + config["sqlfluff"] = {"templater": "dbt"} + config["sqlfluff:templater:dbt"] = { + "profiles_dir": str(generate_profiles_yml(env, copy_config_dir=True).absolute()) + } + return config + + +def _run_sqlfluff(command: str, dialect: str, env: str, additional_args: List[str]) -> None: + with tempfile.TemporaryDirectory() as tmp_dir: + tmp_config_path = pathlib.Path(tmp_dir).joinpath("sqlfluff.config") + with open(tmp_config_path, "w") as tmp_config: + _create_temporary_sqlfluff_config(env).write(tmp_config) + + def sqlfluff_args(sql_dialect: str) -> List[str]: + return [ + "sqlfluff", + command, + "--dialect", + sql_dialect, + "--config", + str(tmp_config_path), + *additional_args, + *map(str, _get_source_tests_paths()), + ] + + try: + subprocess_run(sqlfluff_args(dialect)) + except SubprocessNonZeroExitError as err: + if err.exit_code == SQLFLUFF_DIALECT_LOADING_ERROR and dialect != "ansi": + subprocess_run(sqlfluff_args("ansi")) + else: + raise err + + +def _run_fix_sqlfluff(dialect: str, env: str) -> None: + try: + echo_subinfo("Attempting to fix SQLs. Not every error can be automatically fixed.") + _run_sqlfluff("fix", dialect, env, ["--force"]) + except SubprocessNonZeroExitError as err: + if err.exit_code != SQLFLUFF_FIX_NOT_EVERYTHING_ERROR: + raise err + + +def _run_lint_sqlfluff(dialect: str, env: str) -> None: + try: + echo_subinfo("Linting SQLs.") + _run_sqlfluff("lint", dialect, env, []) + except SubprocessNonZeroExitError as err: + if err.exit_code == SQLFLUFF_LINT_ERROR: + raise SQLLintError + else: + raise err + + +def lint(fix: bool, env: str) -> None: + """ + Lint and format SQL. + + :param fix: Whether to lint and fix linting errors, or just lint. + :type fix: bool + :param env: Name of the environment + :type env: str + """ + echo_info("Linting SQLs:") + dialect = _get_dialect_or_default() + if fix: + _run_fix_sqlfluff(dialect, env) + _run_lint_sqlfluff(dialect, env) + + +@click.command( + name="lint", + short_help="Lint and format SQL", + help="Lint and format SQL using SQLFluff.\n\n" + "For more information on rules and the workings of SQLFluff, " + "refer to https://docs.sqlfluff.com/", +) +@click.option( + "--no-fix", + is_flag=True, + default=False, + type=bool, + help="Whether to lint and fix linting errors, or just lint.", +) +@click.option( + "--env", + default="local", + type=str, + show_default=True, + help="Name of the environment", +) +def lint_command(no_fix: bool, env: str) -> None: + lint(not no_fix, env) diff --git a/data_pipelines_cli/errors.py b/data_pipelines_cli/errors.py index 0e4ef39..f16a77c 100644 --- a/data_pipelines_cli/errors.py +++ b/data_pipelines_cli/errors.py @@ -44,6 +44,8 @@ def __init__(self, project_path: str) -> None: class SubprocessNonZeroExitError(DataPipelinesError): """Exception raised if subprocess exits with non-zero exit code""" + exit_code: int + def __init__( self, subprocess_name: str, exit_code: int, subprocess_output: Optional[str] = None ) -> None: @@ -51,6 +53,7 @@ def __init__( f"{subprocess_name} has exited with non-zero exit code: {exit_code}", submessage=subprocess_output, ) + self.exit_code = exit_code class SubprocessNotFound(DataPipelinesError): @@ -91,3 +94,10 @@ class DockerErrorResponseError(DataPipelinesError): def __init__(self, error_msg: str) -> None: super().__init__("Error raised when using Docker.\n" + error_msg) + + +class SQLLintError(DataPipelinesError): + """Exception raised if there are linting problems in some files.""" + + def __init__(self) -> None: + super().__init__("Fix SQL linting errors.") diff --git a/docs/source/data_pipelines_cli.cli_commands.rst b/docs/source/data_pipelines_cli.cli_commands.rst index a86ebfe..6faebfa 100644 --- a/docs/source/data_pipelines_cli.cli_commands.rst +++ b/docs/source/data_pipelines_cli.cli_commands.rst @@ -64,6 +64,14 @@ data\_pipelines\_cli.cli\_commands.init module :undoc-members: :show-inheritance: +data\_pipelines\_cli.cli\_commands.lint module +---------------------------------------------- + +.. automodule:: data_pipelines_cli.cli_commands.lint + :members: + :undoc-members: + :show-inheritance: + data\_pipelines\_cli.cli\_commands.prepare\_env module ------------------------------------------------------ @@ -119,4 +127,3 @@ data\_pipelines\_cli.cli\_commands.update module :members: :undoc-members: :show-inheritance: - diff --git a/setup.py b/setup.py index d7b0b2f..6b6dcc8 100644 --- a/setup.py +++ b/setup.py @@ -33,6 +33,10 @@ "docker": ["docker>=5.0"], "datahub": ["acryl-datahub>=0.8.17, <0.8.18"], "git": ["GitPython==3.1.26"], + "lint": [ + "sqlfluff==0.11.1", + "sqlfluff-templater-dbt==0.11.1", + ], "tests": [ "pytest>=6.2.2, <7.0.0", "pytest-cov>=2.8.0, <3.0.0", diff --git a/tests/cli_commands/test_lint.py b/tests/cli_commands/test_lint.py new file mode 100644 index 0000000..f723535 --- /dev/null +++ b/tests/cli_commands/test_lint.py @@ -0,0 +1,166 @@ +import pathlib +import shutil +import tempfile +import unittest +from typing import List +from unittest.mock import MagicMock, patch + +import yaml +from click.testing import CliRunner + +from data_pipelines_cli.cli import _cli +from data_pipelines_cli.cli_commands.lint import ( + _get_dialect_or_default, + _get_source_tests_paths, +) +from data_pipelines_cli.errors import SQLLintError, SubprocessNonZeroExitError + +goldens_dir_path = pathlib.Path(__file__).parent.parent.joinpath("goldens") + + +@patch("data_pipelines_cli.cli_commands.lint.generate_profiles_yml", MagicMock()) +class LintCommandTestCase(unittest.TestCase): + def setUp(self) -> None: + self.linted_sqls = [] + self.subprocess_run_args = [] + + self.dbt_project_tmp_dir = pathlib.Path(tempfile.mkdtemp()) + shutil.copyfile( + goldens_dir_path.joinpath("dbt_project.yml"), + self.dbt_project_tmp_dir.joinpath("dbt_project.yml"), + ) + + def tearDown(self) -> None: + shutil.rmtree(self.dbt_project_tmp_dir) + + def _mock_run(self, args: List[str]): + self.subprocess_run_args.append(args) + + def _mock_run_raise_error(self, args: List[str]): + self.subprocess_run_args.append(args) + raise SubprocessNonZeroExitError("sqlfluff", 65) + + @patch("data_pipelines_cli.cli_commands.lint.BUILD_DIR", pathlib.Path("/a/b/c/d/e/f")) + def test_lint_sqls_with_errors(self): + with patch("pathlib.Path.cwd", lambda: pathlib.Path(self.dbt_project_tmp_dir)), patch( + "data_pipelines_cli.cli_commands.lint.subprocess_run", self._mock_run_raise_error + ): + runner = CliRunner() + result = runner.invoke(_cli, ["lint", "--no-fix"]) + self.assertEqual( + 1, result.exit_code, msg="\n".join([str(result.exception), str(result.output)]) + ) + self.assertIsInstance(result.exception, SQLLintError) + self.assertTrue(any(["lint" in sargs for sargs in self.subprocess_run_args])) + self.assertFalse(any(["fix" in sargs for sargs in self.subprocess_run_args])) + + @patch("data_pipelines_cli.cli_commands.lint.BUILD_DIR", pathlib.Path("/a/b/c/d/e/f")) + def test_lint_sqls_without_errors(self): + with patch("pathlib.Path.cwd", lambda: pathlib.Path(self.dbt_project_tmp_dir)), patch( + "data_pipelines_cli.cli_commands.lint.subprocess_run", self._mock_run + ): + runner = CliRunner() + result = runner.invoke(_cli, ["lint", "--no-fix"]) + self.assertEqual( + 0, result.exit_code, msg="\n".join([str(result.exception), str(result.output)]) + ) + self.assertTrue(any(["lint" in sargs for sargs in self.subprocess_run_args])) + self.assertFalse(any(["fix" in sargs for sargs in self.subprocess_run_args])) + + @patch("data_pipelines_cli.cli_commands.lint.BUILD_DIR", pathlib.Path("/a/b/c/d/e/f")) + def test_fix_sqls(self): + with patch("pathlib.Path.cwd", lambda: pathlib.Path(self.dbt_project_tmp_dir)), patch( + "data_pipelines_cli.cli_commands.lint.subprocess_run", self._mock_run + ): + runner = CliRunner() + result = runner.invoke(_cli, ["lint"]) + self.assertEqual( + 0, result.exit_code, msg="\n".join([str(result.exception), str(result.output)]) + ) + self.assertTrue(any(["lint" in sargs for sargs in self.subprocess_run_args])) + self.assertTrue(any(["fix" in sargs for sargs in self.subprocess_run_args])) + + @patch("data_pipelines_cli.cli_commands.lint.BUILD_DIR", pathlib.Path("/a/b/c/d/e/f")) + @patch("data_pipelines_cli.cli_commands.lint.subprocess_run") + def test_raise_unexpected_error(self, subprocess_mock): + for err in [ + ConnectionAbortedError, + FileNotFoundError, + FileExistsError, + KeyError, + FloatingPointError, + ]: + with self.subTest(exception=err), patch( + "pathlib.Path.cwd", lambda: pathlib.Path(self.dbt_project_tmp_dir) + ): + subprocess_mock.side_effect = err + runner = CliRunner() + result = runner.invoke(_cli, ["lint", "--no-fix"]) + self.assertEqual( + 1, result.exit_code, msg="\n".join([str(result.exception), str(result.output)]) + ) + self.assertIsInstance(result.exception, err) + + @patch("data_pipelines_cli.cli_commands.lint.BUILD_DIR", pathlib.Path("/a/b/c/d/e/f")) + @patch("data_pipelines_cli.cli_commands.lint.subprocess_run") + def test_raise_different_subprocess_error(self, subprocess_mock): + subprocess_mock.side_effect = SubprocessNonZeroExitError("sqlfluff", 248) + + with patch("pathlib.Path.cwd", lambda: pathlib.Path(self.dbt_project_tmp_dir)): + runner = CliRunner() + result = runner.invoke(_cli, ["lint", "--no-fix"]) + self.assertEqual( + 1, result.exit_code, msg="\n".join([str(result.exception), str(result.output)]) + ) + self.assertIsInstance(result.exception, SubprocessNonZeroExitError) + self.assertEqual(248, result.exception.exit_code) + + @patch("data_pipelines_cli.cli_commands.lint.BUILD_DIR", pathlib.Path("/a/b/c/d/e/f")) + @patch("data_pipelines_cli.cli_commands.lint.subprocess_run") + def test_raise_wrong_dialect_error(self, subprocess_mock): + subprocess_mock.side_effect = SubprocessNonZeroExitError("sqlfluff", 66) + + with patch("pathlib.Path.cwd", lambda: pathlib.Path(self.dbt_project_tmp_dir)), patch( + "data_pipelines_cli.cli_commands.lint._get_dialect_or_default", lambda: "some_dialect" + ): + runner = CliRunner() + runner.invoke(_cli, ["lint"]) + self.assertEqual(2, subprocess_mock.call_count) + + +class LintHelperFunctionsTestCase(unittest.TestCase): + def test_get_dialect(self): + build_dir_mock = MagicMock() + build_dir_mock.configure_mock(**{"joinpath": lambda _self, *_args: goldens_dir_path}) + + with patch("data_pipelines_cli.cli_commands.lint.BUILD_DIR", build_dir_mock): + self.assertEqual("bigquery", _get_dialect_or_default()) + + @patch("data_pipelines_cli.cli_commands.lint.BUILD_DIR", pathlib.Path("/a/b/c/d/e/f")) + @patch("pathlib.Path.cwd", lambda: goldens_dir_path) + def test_get_dialect_no_build_dir(self): + self.assertEqual("bigquery", _get_dialect_or_default()) + + @patch("data_pipelines_cli.cli_commands.lint.BUILD_DIR", pathlib.Path("/a/b/c/d/e/f")) + @patch("pathlib.Path.cwd", lambda: pathlib.Path("/a/b/c/d/e/f")) + def test_default_dialect(self): + self.assertEqual("ansi", _get_dialect_or_default()) + + def test_get_source_tests_paths(self): + with tempfile.TemporaryDirectory() as tmp_dir, patch( + "pathlib.Path.cwd", lambda: pathlib.Path(tmp_dir) + ): + with open(goldens_dir_path.joinpath("dbt_project.yml"), "r") as orig_dbt, open( + pathlib.Path(tmp_dir).joinpath("dbt_project.yml"), "w" + ) as tmp_dbt: + dbt_project = yaml.safe_load(orig_dbt) + dbt_project["source-paths"] = ["models", "models2", "models3"] + yaml.dump(dbt_project, tmp_dbt) + + self.assertSetEqual( + { + pathlib.Path(tmp_dir).joinpath(dir_name) + for dir_name in ["models", "models2", "models3", "tests"] + }, + set(_get_source_tests_paths()), + ) From 47f41093f2af294f7186108d6440d89e259a18e7 Mon Sep 17 00:00:00 2001 From: Andrzej Swatowski Date: Fri, 15 Apr 2022 16:19:30 +0200 Subject: [PATCH 2/6] Update sqlfluff to 0.12.0 --- data_pipelines_cli/cli_commands/lint.py | 2 ++ setup.py | 8 +++----- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/data_pipelines_cli/cli_commands/lint.py b/data_pipelines_cli/cli_commands/lint.py index 0cef69c..138355f 100644 --- a/data_pipelines_cli/cli_commands/lint.py +++ b/data_pipelines_cli/cli_commands/lint.py @@ -12,6 +12,7 @@ generate_profiles_yml, read_dictionary_from_config_directory, ) +from ..dbt_utils import read_dbt_vars_from_configs from ..errors import SQLLintError, SubprocessNonZeroExitError SQLFLUFF_FIX_NOT_EVERYTHING_ERROR = 1 @@ -53,6 +54,7 @@ def _create_temporary_sqlfluff_config(env: str) -> ConfigParser: config["sqlfluff:templater:dbt"] = { "profiles_dir": str(generate_profiles_yml(env, copy_config_dir=True).absolute()) } + config["sqlfluff:templater:dbt:context"] = read_dbt_vars_from_configs(env) return config diff --git a/setup.py b/setup.py index 6b6dcc8..725685f 100644 --- a/setup.py +++ b/setup.py @@ -12,7 +12,8 @@ "questionary==1.10.0", "pyyaml>=5.1, <6.0", "types-PyYAML>=6.0", - "copier==5.1.0", + # due to the 'regex' conflict between copier and sqlfluff: + "copier @ git+https://github.com/swtwsk/copier@regex-update", "Jinja2>=2.11,<2.12", "fsspec", "packaging>=20.4,<21.0", @@ -33,10 +34,7 @@ "docker": ["docker>=5.0"], "datahub": ["acryl-datahub>=0.8.17, <0.8.18"], "git": ["GitPython==3.1.26"], - "lint": [ - "sqlfluff==0.11.1", - "sqlfluff-templater-dbt==0.11.1", - ], + "lint": ["sqlfluff==0.12.0", "sqlfluff-templater-dbt==0.12.0"], "tests": [ "pytest>=6.2.2, <7.0.0", "pytest-cov>=2.8.0, <3.0.0", From 060d66df420b826fce51558313b78c487c13c5a8 Mon Sep 17 00:00:00 2001 From: Andrzej Swatowski Date: Fri, 15 Apr 2022 17:23:26 +0200 Subject: [PATCH 3/6] Read an existing .sqlfluff config --- data_pipelines_cli/cli_commands/lint.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/data_pipelines_cli/cli_commands/lint.py b/data_pipelines_cli/cli_commands/lint.py index 138355f..f85e667 100644 --- a/data_pipelines_cli/cli_commands/lint.py +++ b/data_pipelines_cli/cli_commands/lint.py @@ -1,7 +1,7 @@ import pathlib import tempfile from configparser import ConfigParser -from typing import List +from typing import Any, List import click import yaml @@ -48,13 +48,27 @@ def _get_source_tests_paths() -> List[pathlib.Path]: return list(map(lambda dir_name: pathlib.Path.cwd().joinpath(dir_name), dir_names)) +def _insert_into_config_section(config: ConfigParser, section: str, key: str, value: Any) -> None: + if section not in config: + config[section] = {} + config[section][key] = value + + def _create_temporary_sqlfluff_config(env: str) -> ConfigParser: + sqlfluff_config_path = pathlib.Path.cwd().joinpath(".sqlfluff") config = ConfigParser() - config["sqlfluff"] = {"templater": "dbt"} - config["sqlfluff:templater:dbt"] = { - "profiles_dir": str(generate_profiles_yml(env, copy_config_dir=True).absolute()) - } + if sqlfluff_config_path.exists(): + config.read(sqlfluff_config_path) + + _insert_into_config_section(config, "sqlfluff", "templater", "dbt") + _insert_into_config_section( + config, + "sqlfluff:templater:dbt", + "profiles_dir", + str(generate_profiles_yml(env, copy_config_dir=True).absolute()), + ) config["sqlfluff:templater:dbt:context"] = read_dbt_vars_from_configs(env) + return config From fbcfce142313afc4025331fe679a1ffec388e252 Mon Sep 17 00:00:00 2001 From: Andrzej Swatowski Date: Fri, 15 Apr 2022 17:29:55 +0200 Subject: [PATCH 4/6] Bump packaging --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 725685f..8e339d3 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,7 @@ "copier @ git+https://github.com/swtwsk/copier@regex-update", "Jinja2>=2.11,<2.12", "fsspec", - "packaging>=20.4,<21.0", + "packaging>=20.9,<21.0", ] EXTRA_FILESYSTEMS_REQUIRE = { From ef05e0e3968f8429eea5b1c89284c8a291468fb3 Mon Sep 17 00:00:00 2001 From: Andrzej Swatowski Date: Tue, 19 Apr 2022 10:41:16 +0200 Subject: [PATCH 5/6] Update docs requirements.txt --- docs/requirements.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/requirements.txt b/docs/requirements.txt index 6ecdde1..763f946 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -3,3 +3,5 @@ sphinx-click>=4.0,<4.1 myst-parser>=0.17, <0.18 docutils>=0.17,<0.18 GitPython==3.1.26 +git+https://github.com/swtwsk/copier@regex-update + From cf74371385df0a2c20d5e0b30d530b5301c418f8 Mon Sep 17 00:00:00 2001 From: Andrzej Swatowski Date: Tue, 19 Apr 2022 11:29:22 +0200 Subject: [PATCH 6/6] Update sqlfluff expectations --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 8e339d3..097a7ae 100644 --- a/setup.py +++ b/setup.py @@ -34,7 +34,7 @@ "docker": ["docker>=5.0"], "datahub": ["acryl-datahub>=0.8.17, <0.8.18"], "git": ["GitPython==3.1.26"], - "lint": ["sqlfluff==0.12.0", "sqlfluff-templater-dbt==0.12.0"], + "lint": ["sqlfluff>=0.12.0,<1.0", "sqlfluff-templater-dbt>=0.12.0,<1.0"], "tests": [ "pytest>=6.2.2, <7.0.0", "pytest-cov>=2.8.0, <3.0.0",