From ea360045c9e2c1e13d218136c1c6492568cc422e Mon Sep 17 00:00:00 2001 From: David Siklosi Date: Sat, 30 Sep 2023 15:45:44 +0200 Subject: [PATCH 001/134] Turning follow_external_dependency into bool or dict property --- dagger/pipeline/io.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/dagger/pipeline/io.py b/dagger/pipeline/io.py index 452798f..32ae303 100644 --- a/dagger/pipeline/io.py +++ b/dagger/pipeline/io.py @@ -19,9 +19,15 @@ def init_attributes(cls, orig_cls): Attribute( attribute_name="follow_external_dependency", required=False, - comment="Weather an external task sensor should be created if this dataset" - "is created in another pipeline. Default is False", + format_help="dictionary or boolean", + comment="External Task Sensor parameters in key value format: https://airflow.apache.org/docs/apache-airflow/stable/_api/airflow/sensors/base/index.html" ), + # Attribute( + # attribute_name="follow_external_dependency", + # required=False, + # comment="Weather an external task sensor should be created if this dataset" + # "is created in another pipeline. 
Default is False", + # ), ] ) @@ -34,7 +40,17 @@ def __init__(self, io_config, config_location): self._has_dependency = self.parse_attribute("has_dependency") if self._has_dependency is None: self._has_dependency = True - self._follow_external_dependency = self.parse_attribute("follow_external_dependency") or False + + follow_external_dependency = self.parse_attribute("follow_external_dependency") + if follow_external_dependency is not None: + if isinstance(follow_external_dependency, bool): + if follow_external_dependency: + follow_external_dependency = dict() + else: + follow_external_dependency = None + else: + follow_external_dependency = dict(follow_external_dependency) + self._follow_external_dependency = follow_external_dependency def __eq__(self, other): return self.alias() == other.alias() From 5113752cee666d61d1e52901d9f1b0121cf4de52 Mon Sep 17 00:00:00 2001 From: David Siklosi Date: Sat, 30 Sep 2023 15:47:07 +0200 Subject: [PATCH 002/134] Handling the new dict format of follow_external_dependency --- dagger/dag_creator/airflow/dag_creator.py | 18 ++++++++++++------ dagger/graph/task_graph.py | 2 +- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/dagger/dag_creator/airflow/dag_creator.py b/dagger/dag_creator/airflow/dag_creator.py index 70358e8..5db416f 100644 --- a/dagger/dag_creator/airflow/dag_creator.py +++ b/dagger/dag_creator/airflow/dag_creator.py @@ -58,7 +58,7 @@ def _get_external_task_sensor_name_dict(self, from_task_id: str) -> dict: "external_sensor_name": f"{from_pipeline_name}-{from_task_name}-sensor", } - def _get_external_task_sensor(self, from_task_id: str, to_task_id: str) -> ExternalTaskSensor: + def _get_external_task_sensor(self, from_task_id: str, to_task_id: str, follow_external_dependency: dict) -> ExternalTaskSensor: """ create an object of external task sensor for a specific from_task_id and to_task_id """ @@ -72,6 +72,14 @@ def _get_external_task_sensor(self, from_task_id: str, to_task_id: str) -> Exter to_pipe_id 
= self._task_graph.get_node(to_task_id).obj.pipeline.name + + extra_args = { + 'mode': conf.EXTERNAL_SENSOR_MODE, + 'poke_interval': conf.EXTERNAL_SENSOR_POKE_INTERVAL, + 'timeout': conf.EXTERNAL_SENSOR_TIMEOUT, + } + extra_args.update(follow_external_dependency) + return ExternalTaskSensor( dag=self._dags[to_pipe_id], task_id=external_sensor_name, @@ -80,9 +88,7 @@ def _get_external_task_sensor(self, from_task_id: str, to_task_id: str) -> Exter execution_date_fn=self._get_execution_date_fn( from_pipeline_schedule, to_pipeline_schedule ), - mode=conf.EXTERNAL_SENSOR_MODE, - poke_interval=conf.EXTERNAL_SENSOR_POKE_INTERVAL, - timeout=conf.EXTERNAL_SENSOR_TIMEOUT, + **extra_args ) def _create_control_flow_task(self, pipe_id, dag): @@ -143,7 +149,7 @@ def _create_edge_without_data(self, from_task_id: str, to_task_ids: list, node: to_pipe = self._task_graph.get_node(to_task_id).obj.pipeline_name if from_pipe and from_pipe == to_pipe: self._tasks[from_task_id] >> self._tasks[to_task_id] - elif from_pipe and from_pipe != to_pipe and edge_properties.follow_external_dependency: + elif from_pipe and from_pipe != to_pipe and edge_properties.follow_external_dependency is not None: from_schedule = self._task_graph.get_node(from_task_id).obj.pipeline.schedule to_schedule = self._task_graph.get_node(to_task_id).obj.pipeline.schedule if not from_schedule.startswith("@") and not to_schedule.startswith("@"): @@ -155,7 +161,7 @@ def _create_edge_without_data(self, from_task_id: str, to_task_ids: list, node: not in self._sensor_dict.get(to_pipe, dict()).keys() ): external_task_sensor = self._get_external_task_sensor( - from_task_id, to_task_id + from_task_id, to_task_id, edge_properties.follow_external_dependency ) self._sensor_dict[to_pipe] = { external_task_sensor_name: external_task_sensor diff --git a/dagger/graph/task_graph.py b/dagger/graph/task_graph.py index c7898f6..0f14a56 100644 --- a/dagger/graph/task_graph.py +++ b/dagger/graph/task_graph.py @@ -55,7 +55,7 @@ def 
add_child(self, child_id): class Edge: - def __init__(self, follow_external_dependency=False): + def __init__(self, follow_external_dependency=None): self._follow_external_dependency = follow_external_dependency @property From fc2a2e2eb530de94e64681ca2f62915186faac7a Mon Sep 17 00:00:00 2001 From: David Siklosi Date: Sat, 30 Sep 2023 15:48:01 +0200 Subject: [PATCH 003/134] Changing test case to see if it handles sensor parameters properly --- .../root/dags/test_external_sensor/dummy_first.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/fixtures/config_finder/root/dags/test_external_sensor/dummy_first.yaml b/tests/fixtures/config_finder/root/dags/test_external_sensor/dummy_first.yaml index 279dd70..e97563f 100644 --- a/tests/fixtures/config_finder/root/dags/test_external_sensor/dummy_first.yaml +++ b/tests/fixtures/config_finder/root/dags/test_external_sensor/dummy_first.yaml @@ -5,7 +5,8 @@ inputs: # format: list | Use dagger init-io cli name: redshift_input schema: dwh table: batch_table - follow_external_dependency: True + follow_external_dependency: + poke_interval: 60 outputs: # format: list | Use dagger init-io cli - type: dummy name: first_dummy_output From db6dee2281f029093b72d6fe6e19847e01825d63 Mon Sep 17 00:00:00 2001 From: claudiazi Date: Mon, 23 Oct 2023 14:54:59 +0200 Subject: [PATCH 004/134] upgrade version of tenacity --- reqs/base.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/reqs/base.txt b/reqs/base.txt index b6c8400..877d102 100644 --- a/reqs/base.txt +++ b/reqs/base.txt @@ -4,4 +4,4 @@ envyaml==1.10.211231 mergedeep==1.3.4 slack==0.0.2 slackclient==2.9.4 -tenacity==8.1.0 +tenacity==8.2.0 From d4634383f84f55492621a41697a94c8b43b6ed0a Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Wed, 15 Nov 2023 14:49:45 +0100 Subject: [PATCH 005/134] added new dbt config parser module --- dagger/utilities/dbt_config_parser.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 
dagger/utilities/dbt_config_parser.py diff --git a/dagger/utilities/dbt_config_parser.py b/dagger/utilities/dbt_config_parser.py new file mode 100644 index 0000000..e69de29 From a91a58fc8b9351f3bb3aa9f5918eb7ba5cb311f1 Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Wed, 15 Nov 2023 14:50:48 +0100 Subject: [PATCH 006/134] added class with constructor --- dagger/utilities/dbt_config_parser.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/dagger/utilities/dbt_config_parser.py b/dagger/utilities/dbt_config_parser.py index e69de29..983e150 100644 --- a/dagger/utilities/dbt_config_parser.py +++ b/dagger/utilities/dbt_config_parser.py @@ -0,0 +1,24 @@ +from os import path +from os.path import join +from typing import Union +import json +import yaml + +ATHENA_IO_BASE = {"type": "athena"} +S3_IO_BASE = {"type": "s3"} + +class DBTConfigParser: + + def __init__(self, default_config_parameters:dict): + self._default_data_bucket = default_config_parameters["data_bucket"] + self._dbt_project_dir = default_config_parameters.get("project_dir", None) + dbt_manifest_path = path.join(self._dbt_project_dir, "target","manifest.json") + self._dbt_profile_dir = default_config_parameters.get("profile_dir", None) + dbt_profile_path = path.join(self._dbt_profile_dir, "profiles.yml") + + with open(dbt_manifest_path, "r") as f: + data = f.read() + self._manifest_data = json.loads(data) + profile_yaml = yaml.safe_load(open(dbt_profile_path, "r")) + prod_dbt_profile = profile_yaml[self._dbt_project_dir]['outputs']['data'] + self._default_data_dir = prod_dbt_profile.get('s3_data_dir') or prod_dbt_profile.get('s3_staging_dir') \ No newline at end of file From 102620e30a7bda042f3cfadb46a64a1b13258c53 Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Wed, 15 Nov 2023 14:53:16 +0100 Subject: [PATCH 007/134] added method to parse dbt model inputs --- dagger/utilities/dbt_config_parser.py | 42 ++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 
deletion(-) diff --git a/dagger/utilities/dbt_config_parser.py b/dagger/utilities/dbt_config_parser.py index 983e150..a3ef0af 100644 --- a/dagger/utilities/dbt_config_parser.py +++ b/dagger/utilities/dbt_config_parser.py @@ -21,4 +21,44 @@ def __init__(self, default_config_parameters:dict): self._manifest_data = json.loads(data) profile_yaml = yaml.safe_load(open(dbt_profile_path, "r")) prod_dbt_profile = profile_yaml[self._dbt_project_dir]['outputs']['data'] - self._default_data_dir = prod_dbt_profile.get('s3_data_dir') or prod_dbt_profile.get('s3_staging_dir') \ No newline at end of file + self._default_data_dir = prod_dbt_profile.get('s3_data_dir') or prod_dbt_profile.get('s3_staging_dir') + def _get_model_data_location(self, node: dict, schema: str, dbt_model_name: str) -> str: + location = node.get("unrendered_config", {}).get("external_location") + if not location: + location = join(self._default_data_dir, schema, dbt_model_name) + + return location.split("data-lake/")[1] + + def _parse_dbt_model_inputs(self, model_name: str) -> dict: + inputs_dict = {} + inputs_list = [] + dbt_ref_to_model = f'model.{self._dbt_project_dir}.{model_name}' + + nodes = self._manifest_data['nodes'] + model_info = nodes[f'model.main.{model_name}'] + + parents_as_full_selectors = model_info.get('depends_on', {}).get('nodes', []) + inputs = [x.split('.')[-1] for x in parents_as_full_selectors] + + for index, node_name in enumerate(parents_as_full_selectors): + if not (".int_" in node_name): + dbt_parent_model_name = node_name.split('.')[-1] + parent_model_node = nodes.get(node_name) + parent_schema = parent_model_node.get('schema') + + model_data_location = self._get_model_data_location(parent_model_node, parent_schema, + dbt_parent_model_name) + + inputs_list.append({ + "schema": parent_schema, + "model_name": inputs[index], + "relative_s3_path": model_data_location + }) + + inputs_dict['model_name'] = model_name + inputs_dict['node_name'] = dbt_ref_to_model + inputs_dict['inputs'] 
= inputs_list + inputs_dict['schema'] = model_info['schema'] + inputs_dict['relative_s3_path'] = self._get_model_data_location(model_info, model_info['schema'], model_name) + + return inputs_dict \ No newline at end of file From c61e656d364aedcf966ae93b2cb52fbddaf89413 Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Wed, 15 Nov 2023 14:54:03 +0100 Subject: [PATCH 008/134] added functions to generate dagger input and outputs for dbt models --- dagger/utilities/dbt_config_parser.py | 45 +++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/dagger/utilities/dbt_config_parser.py b/dagger/utilities/dbt_config_parser.py index a3ef0af..5acb3b7 100644 --- a/dagger/utilities/dbt_config_parser.py +++ b/dagger/utilities/dbt_config_parser.py @@ -22,6 +22,51 @@ def __init__(self, default_config_parameters:dict): profile_yaml = yaml.safe_load(open(dbt_profile_path, "r")) prod_dbt_profile = profile_yaml[self._dbt_project_dir]['outputs']['data'] self._default_data_dir = prod_dbt_profile.get('s3_data_dir') or prod_dbt_profile.get('s3_staging_dir') + + def parse_dbt_staging_model(self, dbt_staging_model: str) -> Union[str, str]: + _model_split, core_table = dbt_staging_model.split('__') + core_schema = _model_split.split('_')[-1] + + return core_schema, core_table + + def generate_dagger_inputs(self, dbt_inputs: dict) -> Union[list[dict], None]: + dagger_inputs = [] + for dbt_input in dbt_inputs['inputs']: + model_name = dbt_input['model_name'] + athena_input = ATHENA_IO_BASE.copy() + s3_input = S3_IO_BASE.copy() + + if (model_name.startswith("stg_")): + athena_input['name'] = model_name + athena_input['schema'], athena_input['table'] = self.parse_dbt_staging_model(model_name) + + dagger_inputs.append(athena_input) + else: + athena_input['name'] = athena_input['table'] = model_name + athena_input['schema'] = dbt_input['schema'] + + s3_input['name'] = model_name + s3_input['bucket'] = self._default_data_bucket + s3_input['path'] = dbt_input['relative_s3_path'] 
+ + dagger_inputs.append(athena_input) + dagger_inputs.append(s3_input) + + return dagger_inputs or None + + def generate_dagger_outputs(self, dbt_inputs: dict) -> list[dict]: + athena_input = ATHENA_IO_BASE.copy() + s3_input = S3_IO_BASE.copy() + + athena_input['name'] = athena_input['table'] = dbt_inputs['model_name'] + athena_input['schema'] = dbt_inputs['schema'] + + s3_input['name'] = dbt_inputs['model_name'] + s3_input['bucket'] = self._default_data_bucket + s3_input['relative_s3_path'] = dbt_inputs['relative_s3_path'] + + return [athena_input, s3_input] + def _get_model_data_location(self, node: dict, schema: str, dbt_model_name: str) -> str: location = node.get("unrendered_config", {}).get("external_location") if not location: From 29858a975c795b240555637de0d2fe079b63ea56 Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Wed, 15 Nov 2023 14:54:20 +0100 Subject: [PATCH 009/134] added fn to generate io for dbt task --- dagger/utilities/dbt_config_parser.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/dagger/utilities/dbt_config_parser.py b/dagger/utilities/dbt_config_parser.py index 5acb3b7..1b2fce1 100644 --- a/dagger/utilities/dbt_config_parser.py +++ b/dagger/utilities/dbt_config_parser.py @@ -23,6 +23,12 @@ def __init__(self, default_config_parameters:dict): prod_dbt_profile = profile_yaml[self._dbt_project_dir]['outputs']['data'] self._default_data_dir = prod_dbt_profile.get('s3_data_dir') or prod_dbt_profile.get('s3_staging_dir') + def generate_io(self, model_name: str) -> tuple[list[dict], list[dict]]: + model_inputs = self._parse_dbt_model_inputs(model_name) + model_dagger_inputs = self.generate_dagger_inputs(model_inputs) + model_dagger_outputs = self.generate_dagger_outputs(model_inputs) + return model_dagger_inputs, model_dagger_outputs + def parse_dbt_staging_model(self, dbt_staging_model: str) -> Union[str, str]: _model_split, core_table = dbt_staging_model.split('__') core_schema = _model_split.split('_')[-1] From 
1a93f056d7dd4d87c02b1c761d6cfb919b6f5ab4 Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Wed, 15 Nov 2023 14:54:30 +0100 Subject: [PATCH 010/134] black format --- dagger/utilities/dbt_config_parser.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dagger/utilities/dbt_config_parser.py b/dagger/utilities/dbt_config_parser.py index 1b2fce1..d30949b 100644 --- a/dagger/utilities/dbt_config_parser.py +++ b/dagger/utilities/dbt_config_parser.py @@ -7,12 +7,13 @@ ATHENA_IO_BASE = {"type": "athena"} S3_IO_BASE = {"type": "s3"} + class DBTConfigParser: - def __init__(self, default_config_parameters:dict): + def __init__(self, default_config_parameters: dict): self._default_data_bucket = default_config_parameters["data_bucket"] self._dbt_project_dir = default_config_parameters.get("project_dir", None) - dbt_manifest_path = path.join(self._dbt_project_dir, "target","manifest.json") + dbt_manifest_path = path.join(self._dbt_project_dir, "target", "manifest.json") self._dbt_profile_dir = default_config_parameters.get("profile_dir", None) dbt_profile_path = path.join(self._dbt_profile_dir, "profiles.yml") From 847f6229bba0f0a40d0fc294cc0f944330f06617 Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Wed, 15 Nov 2023 14:55:02 +0100 Subject: [PATCH 011/134] use new module in fn to generate configs --- dagger/utilities/module.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/dagger/utilities/module.py b/dagger/utilities/module.py index 6f3b395..c697ef8 100644 --- a/dagger/utilities/module.py +++ b/dagger/utilities/module.py @@ -1,6 +1,7 @@ import logging from os import path from mergedeep import merge +from dbt_config_parser import DBTConfigParser import yaml @@ -21,6 +22,7 @@ def __init__(self, path_to_config, target_dir): self._branches_to_generate = config["branches_to_generate"] self._override_parameters = config.get("override_parameters", {}) self._default_parameters = config.get("default_parameters", {}) + self._dbt_module = 
DBTConfigParser(self._default_parameters) @staticmethod def read_yaml(yaml_str): @@ -82,6 +84,11 @@ def generate_task_configs(self): ) task_dict = yaml.safe_load(task_str) + if task == 'dbt': + inputs, outputs = self._dbt_module.generate_io(branch_name) + task_dict['inputs'] = inputs + task_dict['outputs'] = outputs + task_dict["autogenerated_by_dagger"] = self._path_to_config override_parameters = self._override_parameters or {} merge(task_dict, override_parameters.get(branch_name, {}).get(task, {})) From 104013af971b2aaa0cb71451f56921472714df02 Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Wed, 15 Nov 2023 14:59:13 +0100 Subject: [PATCH 012/134] changed output bucket --- dagger/utilities/dbt_config_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dagger/utilities/dbt_config_parser.py b/dagger/utilities/dbt_config_parser.py index d30949b..d15b02b 100644 --- a/dagger/utilities/dbt_config_parser.py +++ b/dagger/utilities/dbt_config_parser.py @@ -69,7 +69,7 @@ def generate_dagger_outputs(self, dbt_inputs: dict) -> list[dict]: athena_input['schema'] = dbt_inputs['schema'] s3_input['name'] = dbt_inputs['model_name'] - s3_input['bucket'] = self._default_data_bucket + s3_input['bucket'] = "cho${ENV}-data-lake" s3_input['relative_s3_path'] = dbt_inputs['relative_s3_path'] return [athena_input, s3_input] From de82c14c60011b15e0743bf102a884a675591a6f Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Wed, 15 Nov 2023 16:22:16 +0100 Subject: [PATCH 013/134] fixed import --- dagger/utilities/module.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dagger/utilities/module.py b/dagger/utilities/module.py index c697ef8..bba316c 100644 --- a/dagger/utilities/module.py +++ b/dagger/utilities/module.py @@ -1,7 +1,7 @@ import logging from os import path from mergedeep import merge -from dbt_config_parser import DBTConfigParser +from dagger.utilities.dbt_config_parser import DBTConfigParser import yaml From 
dcb6854d6f1dfa80c626dc8d6283fa3c7c3a5149 Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Wed, 15 Nov 2023 16:24:10 +0100 Subject: [PATCH 014/134] renamed functions and variables --- dagger/utilities/dbt_config_parser.py | 35 +++++++++++++++------------ 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/dagger/utilities/dbt_config_parser.py b/dagger/utilities/dbt_config_parser.py index d15b02b..9912353 100644 --- a/dagger/utilities/dbt_config_parser.py +++ b/dagger/utilities/dbt_config_parser.py @@ -1,6 +1,6 @@ from os import path from os.path import join -from typing import Union +from typing import Union, Tuple import json import yaml @@ -25,21 +25,25 @@ def __init__(self, default_config_parameters: dict): self._default_data_dir = prod_dbt_profile.get('s3_data_dir') or prod_dbt_profile.get('s3_staging_dir') def generate_io(self, model_name: str) -> tuple[list[dict], list[dict]]: - model_inputs = self._parse_dbt_model_inputs(model_name) - model_dagger_inputs = self.generate_dagger_inputs(model_inputs) - model_dagger_outputs = self.generate_dagger_outputs(model_inputs) + """ + Generates the dagger inputs and outputs for the respective dbt model + Args: + model_parents = self._get_dbt_model_parents(model_name) + model_dagger_inputs = self.generate_dagger_inputs(model_parents) + model_dagger_outputs = self.generate_dagger_outputs(model_parents['model_name'], model_parents['schema'], model_parents['relative_s3_path']) + return model_dagger_inputs, model_dagger_outputs - def parse_dbt_staging_model(self, dbt_staging_model: str) -> Union[str, str]: + def parse_dbt_staging_model(self, dbt_staging_model: str) -> Tuple[str, str]: _model_split, core_table = dbt_staging_model.split('__') core_schema = _model_split.split('_')[-1] return core_schema, core_table - def generate_dagger_inputs(self, dbt_inputs: dict) -> Union[list[dict], None]: + def generate_dagger_inputs(self, dbt_model_parents: dict) -> Union[list[dict], None]: dagger_inputs = [] - for dbt_input 
in dbt_inputs['inputs']: - model_name = dbt_input['model_name'] + for parent in dbt_model_parents['inputs']: + model_name = parent['model_name'] athena_input = ATHENA_IO_BASE.copy() s3_input = S3_IO_BASE.copy() @@ -50,27 +54,26 @@ def generate_dagger_inputs(self, dbt_inputs: dict) -> Union[list[dict], None]: dagger_inputs.append(athena_input) else: athena_input['name'] = athena_input['table'] = model_name - athena_input['schema'] = dbt_input['schema'] + athena_input['schema'] = parent['schema'] s3_input['name'] = model_name s3_input['bucket'] = self._default_data_bucket - s3_input['path'] = dbt_input['relative_s3_path'] + s3_input['path'] = parent['relative_s3_path'] dagger_inputs.append(athena_input) dagger_inputs.append(s3_input) return dagger_inputs or None - def generate_dagger_outputs(self, dbt_inputs: dict) -> list[dict]: + def generate_dagger_outputs(self, model_name: str, schema: str, relative_s3_path: str) -> list[dict]: athena_input = ATHENA_IO_BASE.copy() s3_input = S3_IO_BASE.copy() - athena_input['name'] = athena_input['table'] = dbt_inputs['model_name'] - athena_input['schema'] = dbt_inputs['schema'] + athena_input['name'] = athena_input['table'] = s3_input['name'] = model_name + athena_input['schema'] = schema - s3_input['name'] = dbt_inputs['model_name'] s3_input['bucket'] = "cho${ENV}-data-lake" - s3_input['relative_s3_path'] = dbt_inputs['relative_s3_path'] + s3_input['relative_s3_path'] = relative_s3_path return [athena_input, s3_input] @@ -81,7 +84,7 @@ def _get_model_data_location(self, node: dict, schema: str, dbt_model_name: str) return location.split("data-lake/")[1] - def _parse_dbt_model_inputs(self, model_name: str) -> dict: + def _get_dbt_model_parents(self, model_name: str) -> dict: inputs_dict = {} inputs_list = [] dbt_ref_to_model = f'model.{self._dbt_project_dir}.{model_name}' From fb6f8f6be543e6bbea91f227126af0dc1fac0b8e Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Wed, 15 Nov 2023 16:24:26 +0100 Subject: [PATCH 015/134] added 
type hints and docstrings --- dagger/utilities/dbt_config_parser.py | 63 ++++++++++++++++++++++++++- 1 file changed, 62 insertions(+), 1 deletion(-) diff --git a/dagger/utilities/dbt_config_parser.py b/dagger/utilities/dbt_config_parser.py index 9912353..8262933 100644 --- a/dagger/utilities/dbt_config_parser.py +++ b/dagger/utilities/dbt_config_parser.py @@ -9,7 +9,9 @@ class DBTConfigParser: - + """ + Module that parses the manifest.json file generated by dbt and generates the dagger inputs and outputs for the respective dbt model + """ def __init__(self, default_config_parameters: dict): self._default_data_bucket = default_config_parameters["data_bucket"] self._dbt_project_dir = default_config_parameters.get("project_dir", None) @@ -28,6 +30,12 @@ def generate_io(self, model_name: str) -> tuple[list[dict], list[dict]]: """ Generates the dagger inputs and outputs for the respective dbt model Args: + model_name: name of the dbt model + + Returns: + tuple[list[dict], list[dict]]: dagger inputs and outputs for the respective dbt model + + """ model_parents = self._get_dbt_model_parents(model_name) model_dagger_inputs = self.generate_dagger_inputs(model_parents) model_dagger_outputs = self.generate_dagger_outputs(model_parents['model_name'], model_parents['schema'], model_parents['relative_s3_path']) @@ -35,12 +43,31 @@ def generate_io(self, model_name: str) -> tuple[list[dict], list[dict]]: return model_dagger_inputs, model_dagger_outputs def parse_dbt_staging_model(self, dbt_staging_model: str) -> Tuple[str, str]: + """ + Parses the dbt staging model to get the core schema and table name + Args: + dbt_staging_model: name of the DBT staging model + + Returns: + Tuple[str, str]: core schema and table name + """ _model_split, core_table = dbt_staging_model.split('__') core_schema = _model_split.split('_')[-1] return core_schema, core_table def generate_dagger_inputs(self, dbt_model_parents: dict) -> Union[list[dict], None]: + """ + Generates the dagger inputs for the 
respective dbt model. This means that all parents of the dbt model are added as dagger inputs. + Staging models are added as Athena inputs and core models are added as Athena and S3 inputs. + Intermediate models are not added as an input. + Args: + dbt_model_parents: All parents of the dbt model + + Returns: + Union[list[dict], None]: dagger inputs for the respective dbt model. If there are no parents, returns None + + """ dagger_inputs = [] for parent in dbt_model_parents['inputs']: model_name = parent['model_name'] @@ -66,6 +93,18 @@ def generate_dagger_inputs(self, dbt_model_parents: dict) -> Union[list[dict], N return dagger_inputs or None def generate_dagger_outputs(self, model_name: str, schema: str, relative_s3_path: str) -> list[dict]: + """ + Generates the dagger outputs for the respective dbt model. + This means that an Athena and S3 output is added for the dbt model. + Args: + model_name: The name of the dbt model + schema: The schema of the dbt model + relative_s3_path: The S3 path of the dbt model relative to the data bucket + + Returns: + list[dict]: dagger S3 and Athena outputs for the respective dbt model + + """ athena_input = ATHENA_IO_BASE.copy() s3_input = S3_IO_BASE.copy() @@ -78,6 +117,19 @@ def generate_dagger_outputs(self, model_name: str, schema: str, relative_s3_path return [athena_input, s3_input] def _get_model_data_location(self, node: dict, schema: str, dbt_model_name: str) -> str: + """ + Gets the S3 path of the dbt model relative to the data bucket. + If external location is not specified in the DBT model config, then the default data directory from the + DBT profiles configuration is used. 
+ Args: + node: The extracted node from the manifest.json file + schema: The schema of the dbt model + dbt_model_name: The name of the dbt model + + Returns: + str: The S3 path of the dbt model relative to the data bucket + + """ location = node.get("unrendered_config", {}).get("external_location") if not location: location = join(self._default_data_dir, schema, dbt_model_name) @@ -85,6 +137,15 @@ def _get_model_data_location(self, node: dict, schema: str, dbt_model_name: str) return location.split("data-lake/")[1] def _get_dbt_model_parents(self, model_name: str) -> dict: + """ + Gets all parents of a single dbt model from the manifest.json file + Args: + model_name: The name of the DBT model + + Returns: + dict: All parents of the dbt model along with the name, schema and S3 path of the dbt model itself + + """ inputs_dict = {} inputs_list = [] dbt_ref_to_model = f'model.{self._dbt_project_dir}.{model_name}' From 06dc86bf47584712eaef1267124278d28ff26ebe Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Wed, 15 Nov 2023 16:24:55 +0100 Subject: [PATCH 016/134] changed how models are selected when config is generated --- dagger/utilities/module.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dagger/utilities/module.py b/dagger/utilities/module.py index bba316c..ee7540d 100644 --- a/dagger/utilities/module.py +++ b/dagger/utilities/module.py @@ -88,6 +88,7 @@ def generate_task_configs(self): inputs, outputs = self._dbt_module.generate_io(branch_name) task_dict['inputs'] = inputs task_dict['outputs'] = outputs + task_dict['task_parameters']['select'] = branch_name task_dict["autogenerated_by_dagger"] = self._path_to_config override_parameters = self._override_parameters or {} From 19a13bce8cbeb7ac9d9d444cc8dd623e19ce7564 Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Wed, 15 Nov 2023 16:28:25 +0100 Subject: [PATCH 017/134] black --- dagger/utilities/dbt_config_parser.py | 113 +++++++++++++++----------- 1 file changed, 67 insertions(+), 46 deletions(-) diff 
--git a/dagger/utilities/dbt_config_parser.py b/dagger/utilities/dbt_config_parser.py index 8262933..03d3d66 100644 --- a/dagger/utilities/dbt_config_parser.py +++ b/dagger/utilities/dbt_config_parser.py @@ -12,6 +12,7 @@ class DBTConfigParser: """ Module that parses the manifest.json file generated by dbt and generates the dagger inputs and outputs for the respective dbt model """ + def __init__(self, default_config_parameters: dict): self._default_data_bucket = default_config_parameters["data_bucket"] self._dbt_project_dir = default_config_parameters.get("project_dir", None) @@ -23,8 +24,10 @@ def __init__(self, default_config_parameters: dict): data = f.read() self._manifest_data = json.loads(data) profile_yaml = yaml.safe_load(open(dbt_profile_path, "r")) - prod_dbt_profile = profile_yaml[self._dbt_project_dir]['outputs']['data'] - self._default_data_dir = prod_dbt_profile.get('s3_data_dir') or prod_dbt_profile.get('s3_staging_dir') + prod_dbt_profile = profile_yaml[self._dbt_project_dir]["outputs"]["data"] + self._default_data_dir = prod_dbt_profile.get( + "s3_data_dir" + ) or prod_dbt_profile.get("s3_staging_dir") def generate_io(self, model_name: str) -> tuple[list[dict], list[dict]]: """ @@ -38,7 +41,11 @@ def generate_io(self, model_name: str) -> tuple[list[dict], list[dict]]: """ model_parents = self._get_dbt_model_parents(model_name) model_dagger_inputs = self.generate_dagger_inputs(model_parents) - model_dagger_outputs = self.generate_dagger_outputs(model_parents['model_name'], model_parents['schema'], model_parents['relative_s3_path']) + model_dagger_outputs = self.generate_dagger_outputs( + model_parents["model_name"], + model_parents["schema"], + model_parents["relative_s3_path"], + ) return model_dagger_inputs, model_dagger_outputs @@ -51,12 +58,14 @@ def parse_dbt_staging_model(self, dbt_staging_model: str) -> Tuple[str, str]: Returns: Tuple[str, str]: core schema and table name """ - _model_split, core_table = dbt_staging_model.split('__') - 
core_schema = _model_split.split('_')[-1] + _model_split, core_table = dbt_staging_model.split("__") + core_schema = _model_split.split("_")[-1] return core_schema, core_table - def generate_dagger_inputs(self, dbt_model_parents: dict) -> Union[list[dict], None]: + def generate_dagger_inputs( + self, dbt_model_parents: dict + ) -> Union[list[dict], None]: """ Generates the dagger inputs for the respective dbt model. This means that all parents of the dbt model are added as dagger inputs. Staging models are added as Athena inputs and core models are added as Athena and S3 inputs. @@ -69,30 +78,35 @@ def generate_dagger_inputs(self, dbt_model_parents: dict) -> Union[list[dict], N """ dagger_inputs = [] - for parent in dbt_model_parents['inputs']: - model_name = parent['model_name'] + for parent in dbt_model_parents["inputs"]: + model_name = parent["model_name"] athena_input = ATHENA_IO_BASE.copy() s3_input = S3_IO_BASE.copy() - if (model_name.startswith("stg_")): - athena_input['name'] = model_name - athena_input['schema'], athena_input['table'] = self.parse_dbt_staging_model(model_name) + if model_name.startswith("stg_"): + athena_input["name"] = model_name + ( + athena_input["schema"], + athena_input["table"], + ) = self.parse_dbt_staging_model(model_name) dagger_inputs.append(athena_input) else: - athena_input['name'] = athena_input['table'] = model_name - athena_input['schema'] = parent['schema'] + athena_input["name"] = athena_input["table"] = model_name + athena_input["schema"] = parent["schema"] - s3_input['name'] = model_name - s3_input['bucket'] = self._default_data_bucket - s3_input['path'] = parent['relative_s3_path'] + s3_input["name"] = model_name + s3_input["bucket"] = self._default_data_bucket + s3_input["path"] = parent["relative_s3_path"] dagger_inputs.append(athena_input) dagger_inputs.append(s3_input) return dagger_inputs or None - def generate_dagger_outputs(self, model_name: str, schema: str, relative_s3_path: str) -> list[dict]: + def 
generate_dagger_outputs( + self, model_name: str, schema: str, relative_s3_path: str + ) -> list[dict]: """ Generates the dagger outputs for the respective dbt model. This means that an Athena and S3 output is added for the dbt model. @@ -108,15 +122,17 @@ def generate_dagger_outputs(self, model_name: str, schema: str, relative_s3_path athena_input = ATHENA_IO_BASE.copy() s3_input = S3_IO_BASE.copy() - athena_input['name'] = athena_input['table'] = s3_input['name'] = model_name - athena_input['schema'] = schema + athena_input["name"] = athena_input["table"] = s3_input["name"] = model_name + athena_input["schema"] = schema - s3_input['bucket'] = "cho${ENV}-data-lake" - s3_input['relative_s3_path'] = relative_s3_path + s3_input["bucket"] = "cho${ENV}-data-lake" + s3_input["relative_s3_path"] = relative_s3_path return [athena_input, s3_input] - def _get_model_data_location(self, node: dict, schema: str, dbt_model_name: str) -> str: + def _get_model_data_location( + self, node: dict, schema: str, dbt_model_name: str + ) -> str: """ Gets the S3 path of the dbt model relative to the data bucket. 
If external location is not specified in the DBT model config, then the default data directory from the @@ -148,33 +164,38 @@ def _get_dbt_model_parents(self, model_name: str) -> dict: """ inputs_dict = {} inputs_list = [] - dbt_ref_to_model = f'model.{self._dbt_project_dir}.{model_name}' + dbt_ref_to_model = f"model.{self._dbt_project_dir}.{model_name}" - nodes = self._manifest_data['nodes'] - model_info = nodes[f'model.main.{model_name}'] + nodes = self._manifest_data["nodes"] + model_info = nodes[f"model.main.{model_name}"] - parents_as_full_selectors = model_info.get('depends_on', {}).get('nodes', []) - inputs = [x.split('.')[-1] for x in parents_as_full_selectors] + parents_as_full_selectors = model_info.get("depends_on", {}).get("nodes", []) + inputs = [x.split(".")[-1] for x in parents_as_full_selectors] for index, node_name in enumerate(parents_as_full_selectors): if not (".int_" in node_name): - dbt_parent_model_name = node_name.split('.')[-1] + dbt_parent_model_name = node_name.split(".")[-1] parent_model_node = nodes.get(node_name) - parent_schema = parent_model_node.get('schema') - - model_data_location = self._get_model_data_location(parent_model_node, parent_schema, - dbt_parent_model_name) - - inputs_list.append({ - "schema": parent_schema, - "model_name": inputs[index], - "relative_s3_path": model_data_location - }) - - inputs_dict['model_name'] = model_name - inputs_dict['node_name'] = dbt_ref_to_model - inputs_dict['inputs'] = inputs_list - inputs_dict['schema'] = model_info['schema'] - inputs_dict['relative_s3_path'] = self._get_model_data_location(model_info, model_info['schema'], model_name) - - return inputs_dict \ No newline at end of file + parent_schema = parent_model_node.get("schema") + + model_data_location = self._get_model_data_location( + parent_model_node, parent_schema, dbt_parent_model_name + ) + + inputs_list.append( + { + "schema": parent_schema, + "model_name": inputs[index], + "relative_s3_path": model_data_location, + } + ) + 
+ inputs_dict["model_name"] = model_name + inputs_dict["node_name"] = dbt_ref_to_model + inputs_dict["inputs"] = inputs_list + inputs_dict["schema"] = model_info["schema"] + inputs_dict["relative_s3_path"] = self._get_model_data_location( + model_info, model_info["schema"], model_name + ) + + return inputs_dict From 5f83797d424b14dfe373b5efa660cba10e13725d Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Thu, 16 Nov 2023 11:45:30 +0100 Subject: [PATCH 018/134] fix project dir to load the correct profile --- dagger/utilities/dbt_config_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dagger/utilities/dbt_config_parser.py b/dagger/utilities/dbt_config_parser.py index 03d3d66..5f81dbf 100644 --- a/dagger/utilities/dbt_config_parser.py +++ b/dagger/utilities/dbt_config_parser.py @@ -24,7 +24,7 @@ def __init__(self, default_config_parameters: dict): data = f.read() self._manifest_data = json.loads(data) profile_yaml = yaml.safe_load(open(dbt_profile_path, "r")) - prod_dbt_profile = profile_yaml[self._dbt_project_dir]["outputs"]["data"] + prod_dbt_profile = profile_yaml[self._dbt_project_dir.split("/")[-1]]["outputs"]["data"] self._default_data_dir = prod_dbt_profile.get( "s3_data_dir" ) or prod_dbt_profile.get("s3_staging_dir") From 209efd605d1947e6ff40eaef80647fa54227da84 Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Thu, 16 Nov 2023 11:46:11 +0100 Subject: [PATCH 019/134] get external_location from config instead of unrendered config --- dagger/utilities/dbt_config_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dagger/utilities/dbt_config_parser.py b/dagger/utilities/dbt_config_parser.py index 5f81dbf..cfa3c81 100644 --- a/dagger/utilities/dbt_config_parser.py +++ b/dagger/utilities/dbt_config_parser.py @@ -146,7 +146,7 @@ def _get_model_data_location( str: The S3 path of the dbt model relative to the data bucket """ - location = node.get("unrendered_config", {}).get("external_location") + location = 
node.get("config", {}).get("external_location") if not location: location = join(self._default_data_dir, schema, dbt_model_name) From 3c4e4c369d36a2e5b619d2aeb32a621e862e4344 Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Thu, 16 Nov 2023 11:46:49 +0100 Subject: [PATCH 020/134] renamed variables for better understanding --- dagger/utilities/dbt_config_parser.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/dagger/utilities/dbt_config_parser.py b/dagger/utilities/dbt_config_parser.py index cfa3c81..801bb3d 100644 --- a/dagger/utilities/dbt_config_parser.py +++ b/dagger/utilities/dbt_config_parser.py @@ -169,23 +169,23 @@ def _get_dbt_model_parents(self, model_name: str) -> dict: nodes = self._manifest_data["nodes"] model_info = nodes[f"model.main.{model_name}"] - parents_as_full_selectors = model_info.get("depends_on", {}).get("nodes", []) - inputs = [x.split(".")[-1] for x in parents_as_full_selectors] + parent_node_names = model_info.get("depends_on", {}).get("nodes", []) + parent_model_names = [x.split(".")[-1] for x in parent_node_names] - for index, node_name in enumerate(parents_as_full_selectors): - if not (".int_" in node_name): - dbt_parent_model_name = node_name.split(".")[-1] - parent_model_node = nodes.get(node_name) + for index, parent_node_name in enumerate(parent_node_names): + if not (".int_" in parent_node_name): + parent_model_name = parent_node_name.split(".")[-1] + parent_model_node = nodes.get(parent_node_name) parent_schema = parent_model_node.get("schema") model_data_location = self._get_model_data_location( - parent_model_node, parent_schema, dbt_parent_model_name + parent_model_node, parent_schema, parent_model_name ) inputs_list.append( { "schema": parent_schema, - "model_name": inputs[index], + "model_name": parent_model_names[index], "relative_s3_path": model_data_location, } ) From 1bd525cdc0761080d0f8d42e2cb51b09d17d55c3 Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Thu, 16 Nov 2023 11:47:15 
+0100 Subject: [PATCH 021/134] added doctest for fn --- dagger/utilities/dbt_config_parser.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/dagger/utilities/dbt_config_parser.py b/dagger/utilities/dbt_config_parser.py index 801bb3d..12f5e92 100644 --- a/dagger/utilities/dbt_config_parser.py +++ b/dagger/utilities/dbt_config_parser.py @@ -57,6 +57,11 @@ def parse_dbt_staging_model(self, dbt_staging_model: str) -> Tuple[str, str]: Returns: Tuple[str, str]: core schema and table name + + >>> parse_dbt_staging_model("schema_name__table") + ('schema_name', 'table') + >>> parse_dbt_staging_model("another_schema__another_table") + ('another_schema', 'another_table') """ _model_split, core_table = dbt_staging_model.split("__") core_schema = _model_split.split("_")[-1] From 4b651ad24ba3598f2e342ce070367ccd4adf757a Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Thu, 16 Nov 2023 11:48:09 +0100 Subject: [PATCH 022/134] added files for fixtures and tests --- tests/fixtures/modules/dbt_config_parser_fixtures.py | 0 tests/utilities/test_dbt_config_parser.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 tests/fixtures/modules/dbt_config_parser_fixtures.py create mode 100644 tests/utilities/test_dbt_config_parser.py diff --git a/tests/fixtures/modules/dbt_config_parser_fixtures.py b/tests/fixtures/modules/dbt_config_parser_fixtures.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/utilities/test_dbt_config_parser.py b/tests/utilities/test_dbt_config_parser.py new file mode 100644 index 0000000..e69de29 From 7123cbcddbc0830d5440ec263d067acc9d9cfa47 Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Thu, 16 Nov 2023 11:49:08 +0100 Subject: [PATCH 023/134] added fixture for manifest and profiles --- .../modules/dbt_config_parser_fixtures.py | 67 +++++++++++++++++++ 1 file changed, 67 insertions(+) diff --git a/tests/fixtures/modules/dbt_config_parser_fixtures.py b/tests/fixtures/modules/dbt_config_parser_fixtures.py index 
e69de29..96ac1fb 100644 --- a/tests/fixtures/modules/dbt_config_parser_fixtures.py +++ b/tests/fixtures/modules/dbt_config_parser_fixtures.py @@ -0,0 +1,67 @@ +DBT_MANIFEST_FILE_FIXTURE = { + "nodes": { + "model.main.model1": { + "database": "awsdatacatalog", + "schema": "analytics_engineering", + "name": "fct_supplier_revenue", + "config": { + "external_location": "s3://bucket1-data-lake/path1/model1", + "materialized": "incremental", + "incremental_strategy": "insert_overwrite", + }, + "description": "Details of revenue calculation at supplier level for each observation day", + "tags": ["daily"], + "unrendered_config": { + "materialized": "incremental", + "external_location": "s3://bucket1-data-lake/path1/model1", + "incremental_strategy": "insert_overwrite", + "partitioned_by": ["year", "month", "day", "dt"], + "tags": ["daily"], + "on_schema_change": "fail", + }, + "depends_on": { + "macros": [ + "macro.main.macro1", + "macro.main.macro2", + ], + "nodes": [ + "model.main.stg_core_schema1__table1", + "model.main.model2", + "model.main.int_model3", + ], + }, + }, + "model.main.stg_core_schema1__table1": { + "schema": "analytics_engineering", + }, + "model.main.model2": { + "schema": "analytics_engineering", + "config": { + "external_location": "s3://bucket1-data-lake/path2/model2", + }, + }, + "model.main.int_model3": { + "schema": "analytics_engineering", + }, + } +} + +DBT_PROFILE_FIXTURE = { + "main": { + "outputs": { + "data": { + "aws_profile_name": "data", + "database": "awsdatacatalog", + "num_retries": 10, + "region_name": "eu-west-1", + "s3_data_dir": "s3://bucket1-data-lake/path1/tmp", + "s3_data_naming": "schema_table", + "s3_staging_dir": "s3://bucket1-data-lake/path1/", + "schema": "analytics_engineering", + "threads": 4, + "type": "athena", + "work_group": "primary", + }, + } + } +} \ No newline at end of file From 77fb2b24b4d83bb2b8c742daf4ae5b6c5be4a7d3 Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Thu, 16 Nov 2023 11:51:09 +0100 Subject: 
[PATCH 024/134] setUp test class --- tests/utilities/test_dbt_config_parser.py | 30 +++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/tests/utilities/test_dbt_config_parser.py b/tests/utilities/test_dbt_config_parser.py index e69de29..d383c1b 100644 --- a/tests/utilities/test_dbt_config_parser.py +++ b/tests/utilities/test_dbt_config_parser.py @@ -0,0 +1,30 @@ +import logging +import unittest +from unittest import skip +from unittest.mock import patch, MagicMock + +from dagger.utilities.dbt_config_parser import DBTConfigParser +from dagger.utilities.module import Module +from tests.fixtures.modules.dbt_config_parser_fixtures import ( + EXPECTED_DBT_MODEL_PARENTS, + EXPECTED_DAGGER_INPUTS, + DBT_MANIFEST_FILE_FIXTURE, + DBT_PROFILE_FIXTURE, + EXPECTED_DAGGER_OUTPUTS, +) + +_logger = logging.getLogger("root") + +DEFAULT_CONFIG_PARAMS = { + "data_bucket": "bucket1-data-lake", + "project_dir": "main", + "profile_dir": ".dbt", +} + + +class TestDBTConfigParser(unittest.TestCase): + @patch("builtins.open", new_callable=MagicMock, read_data=DBT_MANIFEST_FILE_FIXTURE) + @patch("json.loads", return_value=DBT_MANIFEST_FILE_FIXTURE) + @patch("yaml.safe_load", return_value=DBT_PROFILE_FIXTURE) + def setUp(self, mock_open, mock_json_load, mock_safe_load): + self._dbt_config_parser = DBTConfigParser(DEFAULT_CONFIG_PARAMS) From 47c063856b7aa8addf8d3f0fdfd7779eaf510441 Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Thu, 16 Nov 2023 11:53:08 +0100 Subject: [PATCH 025/134] added test for get_dbt_model_parents --- .../modules/dbt_config_parser_fixtures.py | 19 +++++++++++++++++++ tests/utilities/test_dbt_config_parser.py | 7 +++++++ 2 files changed, 26 insertions(+) diff --git a/tests/fixtures/modules/dbt_config_parser_fixtures.py b/tests/fixtures/modules/dbt_config_parser_fixtures.py index 96ac1fb..372153a 100644 --- a/tests/fixtures/modules/dbt_config_parser_fixtures.py +++ b/tests/fixtures/modules/dbt_config_parser_fixtures.py @@ -64,4 +64,23 @@ }, } } +} + 
+EXPECTED_DBT_MODEL_PARENTS = { + "inputs": [ + { + "model_name": "stg_core_schema1__table1", + "relative_s3_path": "path1/tmp/analytics_engineering/stg_core_schema1__table1", + "schema": "analytics_engineering", + }, + { + "model_name": "model2", + "relative_s3_path": "path2/model2", + "schema": "analytics_engineering", + }, + ], + "model_name": "model1", + "node_name": "model.main.model1", + "relative_s3_path": "path1/model1", + "schema": "analytics_engineering", } \ No newline at end of file diff --git a/tests/utilities/test_dbt_config_parser.py b/tests/utilities/test_dbt_config_parser.py index d383c1b..4835b2b 100644 --- a/tests/utilities/test_dbt_config_parser.py +++ b/tests/utilities/test_dbt_config_parser.py @@ -20,6 +20,7 @@ "project_dir": "main", "profile_dir": ".dbt", } +MODEL_NAME = "model1" class TestDBTConfigParser(unittest.TestCase): @@ -28,3 +29,9 @@ class TestDBTConfigParser(unittest.TestCase): @patch("yaml.safe_load", return_value=DBT_PROFILE_FIXTURE) def setUp(self, mock_open, mock_json_load, mock_safe_load): self._dbt_config_parser = DBTConfigParser(DEFAULT_CONFIG_PARAMS) + + + def test_get_dbt_model_parents(self): + result = self._dbt_config_parser._get_dbt_model_parents(MODEL_NAME) + + self.assertDictEqual(result, EXPECTED_DBT_MODEL_PARENTS) \ No newline at end of file From 4eced6b6685758bb9e7bc3080c930afe9d813ff6 Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Thu, 16 Nov 2023 11:53:40 +0100 Subject: [PATCH 026/134] added tests for generate_dagger_inputs --- .../modules/dbt_config_parser_fixtures.py | 23 ++++++++++++++++++- tests/utilities/test_dbt_config_parser.py | 9 +++++++- 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/tests/fixtures/modules/dbt_config_parser_fixtures.py b/tests/fixtures/modules/dbt_config_parser_fixtures.py index 372153a..ea6acd3 100644 --- a/tests/fixtures/modules/dbt_config_parser_fixtures.py +++ b/tests/fixtures/modules/dbt_config_parser_fixtures.py @@ -83,4 +83,25 @@ "node_name": 
"model.main.model1", "relative_s3_path": "path1/model1", "schema": "analytics_engineering", -} \ No newline at end of file +} + +EXPECTED_DAGGER_INPUTS = [ + { + "name": "stg_core_schema1__table1", + "schema": "schema1", + "table": "table1", + "type": "athena", + }, + { + "name": "model2", + "schema": "analytics_engineering", + "table": "model2", + "type": "athena", + }, + { + "bucket": "bucket1-data-lake", + "name": "model2", + "path": "path2/model2", + "type": "s3", + }, +] \ No newline at end of file diff --git a/tests/utilities/test_dbt_config_parser.py b/tests/utilities/test_dbt_config_parser.py index 4835b2b..a882760 100644 --- a/tests/utilities/test_dbt_config_parser.py +++ b/tests/utilities/test_dbt_config_parser.py @@ -34,4 +34,11 @@ def setUp(self, mock_open, mock_json_load, mock_safe_load): def test_get_dbt_model_parents(self): result = self._dbt_config_parser._get_dbt_model_parents(MODEL_NAME) - self.assertDictEqual(result, EXPECTED_DBT_MODEL_PARENTS) \ No newline at end of file + self.assertDictEqual(result, EXPECTED_DBT_MODEL_PARENTS) + + def test_generate_dagger_inputs(self): + result_inputs = self._dbt_config_parser.generate_dagger_inputs( + EXPECTED_DBT_MODEL_PARENTS + ) + + self.assertListEqual(result_inputs, EXPECTED_DAGGER_INPUTS) \ No newline at end of file From aa767e8d670b99deac0b6568ccae5fa0a1a4701a Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Thu, 16 Nov 2023 11:53:56 +0100 Subject: [PATCH 027/134] added test for generate_dagger_outputs --- .../modules/dbt_config_parser_fixtures.py | 17 ++++++++++++++++- tests/utilities/test_dbt_config_parser.py | 11 ++++++++++- 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/tests/fixtures/modules/dbt_config_parser_fixtures.py b/tests/fixtures/modules/dbt_config_parser_fixtures.py index ea6acd3..821c5b8 100644 --- a/tests/fixtures/modules/dbt_config_parser_fixtures.py +++ b/tests/fixtures/modules/dbt_config_parser_fixtures.py @@ -104,4 +104,19 @@ "path": "path2/model2", "type": "s3", 
}, -] \ No newline at end of file +] + +EXPECTED_DAGGER_OUTPUTS = [ + { + "name": "model1", + "schema": "analytics_engineering", + "table": "model1", + "type": "athena", + }, + { + "bucket": "cho${ENV}-data-lake", + "name": "model1", + "relative_s3_path": "path1/model1", + "type": "s3", + }, +] diff --git a/tests/utilities/test_dbt_config_parser.py b/tests/utilities/test_dbt_config_parser.py index a882760..a09bbc4 100644 --- a/tests/utilities/test_dbt_config_parser.py +++ b/tests/utilities/test_dbt_config_parser.py @@ -41,4 +41,13 @@ def test_generate_dagger_inputs(self): EXPECTED_DBT_MODEL_PARENTS ) - self.assertListEqual(result_inputs, EXPECTED_DAGGER_INPUTS) \ No newline at end of file + self.assertListEqual(result_inputs, EXPECTED_DAGGER_INPUTS) + + def test_generate_dagger_outputs(self): + result_outputs = self._dbt_config_parser.generate_dagger_outputs( + EXPECTED_DBT_MODEL_PARENTS["model_name"], + EXPECTED_DBT_MODEL_PARENTS["schema"], + EXPECTED_DBT_MODEL_PARENTS["relative_s3_path"], + ) + + self.assertListEqual(result_outputs, EXPECTED_DAGGER_OUTPUTS) From 8cecbe67f0584758e2369250be74726580526145 Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Thu, 16 Nov 2023 12:07:27 +0100 Subject: [PATCH 028/134] fixed type hint --- dagger/utilities/dbt_config_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dagger/utilities/dbt_config_parser.py b/dagger/utilities/dbt_config_parser.py index 12f5e92..afc011a 100644 --- a/dagger/utilities/dbt_config_parser.py +++ b/dagger/utilities/dbt_config_parser.py @@ -29,7 +29,7 @@ def __init__(self, default_config_parameters: dict): "s3_data_dir" ) or prod_dbt_profile.get("s3_staging_dir") - def generate_io(self, model_name: str) -> tuple[list[dict], list[dict]]: + def generate_io(self, model_name: str) -> Tuple[list[dict], list[dict]]: """ Generates the dagger inputs and outputs for the respective dbt model Args: From 04eafe6aa68154d8afe80d2e8c8d6f782e3382f0 Mon Sep 17 00:00:00 2001 From: Kiran Vasudev 
Date: Thu, 16 Nov 2023 12:20:13 +0100 Subject: [PATCH 029/134] fixed type hints --- dagger/utilities/dbt_config_parser.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/dagger/utilities/dbt_config_parser.py b/dagger/utilities/dbt_config_parser.py index afc011a..4d444b8 100644 --- a/dagger/utilities/dbt_config_parser.py +++ b/dagger/utilities/dbt_config_parser.py @@ -1,6 +1,7 @@ from os import path from os.path import join -from typing import Union, Tuple +from typing import Union, Tuple, List, Dict + import json import yaml @@ -8,6 +9,7 @@ S3_IO_BASE = {"type": "s3"} + class DBTConfigParser: """ Module that parses the manifest.json file generated by dbt and generates the dagger inputs and outputs for the respective dbt model @@ -29,14 +31,14 @@ def __init__(self, default_config_parameters: dict): "s3_data_dir" ) or prod_dbt_profile.get("s3_staging_dir") - def generate_io(self, model_name: str) -> Tuple[list[dict], list[dict]]: + def generate_io(self, model_name: str) -> Tuple[List[Dict], List[Dict]]: """ Generates the dagger inputs and outputs for the respective dbt model Args: model_name: name of the dbt model Returns: - tuple[list[dict], list[dict]]: dagger inputs and outputs for the respective dbt model + tuple[List[Dict], List[Dict]]: dagger inputs and outputs for the respective dbt model """ model_parents = self._get_dbt_model_parents(model_name) @@ -70,7 +72,7 @@ def parse_dbt_staging_model(self, dbt_staging_model: str) -> Tuple[str, str]: def generate_dagger_inputs( self, dbt_model_parents: dict - ) -> Union[list[dict], None]: + ) -> Union[List[Dict], None]: """ Generates the dagger inputs for the respective dbt model. This means that all parents of the dbt model are added as dagger inputs. Staging models are added as Athena inputs and core models are added as Athena and S3 inputs. 
@@ -79,7 +81,7 @@ def generate_dagger_inputs( dbt_model_parents: All parents of the dbt model Returns: - Union[list[dict], None]: dagger inputs for the respective dbt model. If there are no parents, returns None + Union[List[Dict], None]: dagger inputs for the respective dbt model. If there are no parents, returns None """ dagger_inputs = [] @@ -111,7 +113,7 @@ def generate_dagger_inputs( def generate_dagger_outputs( self, model_name: str, schema: str, relative_s3_path: str - ) -> list[dict]: + ) -> List[Dict]: """ Generates the dagger outputs for the respective dbt model. This means that an Athena and S3 output is added for the dbt model. @@ -121,7 +123,7 @@ def generate_dagger_outputs( relative_s3_path: The S3 path of the dbt model relative to the data bucket Returns: - list[dict]: dagger S3 and Athena outputs for the respective dbt model + List[Dict]: dagger S3 and Athena outputs for the respective dbt model """ athena_input = ATHENA_IO_BASE.copy() From 745311dd57d85de980eea6d643816d6a2d627dcf Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Fri, 17 Nov 2023 21:18:40 +0100 Subject: [PATCH 030/134] refactored for simplicity --- dagger/utilities/dbt_config_parser.py | 186 ++++++++------------------ dagger/utilities/module.py | 2 +- 2 files changed, 56 insertions(+), 132 deletions(-) diff --git a/dagger/utilities/dbt_config_parser.py b/dagger/utilities/dbt_config_parser.py index afc011a..d811650 100644 --- a/dagger/utilities/dbt_config_parser.py +++ b/dagger/utilities/dbt_config_parser.py @@ -1,11 +1,13 @@ +import json from os import path from os.path import join -from typing import Union, Tuple -import json +from pprint import pprint +from typing import Tuple, List, Dict + import yaml -ATHENA_IO_BASE = {"type": "athena"} -S3_IO_BASE = {"type": "s3"} +ATHENA_TASK_BASE = {"type": "athena"} +S3_TASK_BASE = {"type": "s3"} class DBTConfigParser: @@ -24,116 +26,58 @@ def __init__(self, default_config_parameters: dict): data = f.read() self._manifest_data = 
json.loads(data) profile_yaml = yaml.safe_load(open(dbt_profile_path, "r")) - prod_dbt_profile = profile_yaml[self._dbt_project_dir.split("/")[-1]]["outputs"]["data"] + prod_dbt_profile = profile_yaml[self._dbt_project_dir.split("/")[-1]][ + "outputs" + ]["data"] self._default_data_dir = prod_dbt_profile.get( "s3_data_dir" ) or prod_dbt_profile.get("s3_staging_dir") - def generate_io(self, model_name: str) -> Tuple[list[dict], list[dict]]: + def _generate_dagger_dependency(self, node: dict) -> List[Dict]: """ - Generates the dagger inputs and outputs for the respective dbt model + Generates the dagger task based on whether the DBT model node is a staging model or not. + If the DBT model node represents a staging model, then a dagger athena task is generated for each source of the DBT model. + If the DBT model node is not a staging model, then a dagger athena task and an s3 task is generated for the DBT model node itself. Args: - model_name: name of the dbt model - - Returns: - tuple[list[dict], list[dict]]: dagger inputs and outputs for the respective dbt model - - """ - model_parents = self._get_dbt_model_parents(model_name) - model_dagger_inputs = self.generate_dagger_inputs(model_parents) - model_dagger_outputs = self.generate_dagger_outputs( - model_parents["model_name"], - model_parents["schema"], - model_parents["relative_s3_path"], - ) - - return model_dagger_inputs, model_dagger_outputs - - def parse_dbt_staging_model(self, dbt_staging_model: str) -> Tuple[str, str]: - """ - Parses the dbt staging model to get the core schema and table name - Args: - dbt_staging_model: name of the DBT staging model + node: The extracted node from the manifest.json file Returns: - Tuple[str, str]: core schema and table name - - >>> parse_dbt_staging_model("schema_name__table") - ('schema_name', 'table') - >>> parse_dbt_staging_model("another_schema__another_table") - ('another_schema', 'another_table') - """ - _model_split, core_table = dbt_staging_model.split("__") - 
core_schema = _model_split.split("_")[-1] - - return core_schema, core_table + List[Dict]: The respective dagger tasks for the DBT model node - def generate_dagger_inputs( - self, dbt_model_parents: dict - ) -> Union[list[dict], None]: """ - Generates the dagger inputs for the respective dbt model. This means that all parents of the dbt model are added as dagger inputs. - Staging models are added as Athena inputs and core models are added as Athena and S3 inputs. - Intermediate models are not added as an input. - Args: - dbt_model_parents: All parents of the dbt model + model_name = node["name"] - Returns: - Union[list[dict], None]: dagger inputs for the respective dbt model. If there are no parents, returns None + s3_task = S3_TASK_BASE.copy() + dagger_tasks = [] - """ - dagger_inputs = [] - for parent in dbt_model_parents["inputs"]: - model_name = parent["model_name"] - athena_input = ATHENA_IO_BASE.copy() - s3_input = S3_IO_BASE.copy() - - if model_name.startswith("stg_"): - athena_input["name"] = model_name - ( - athena_input["schema"], - athena_input["table"], - ) = self.parse_dbt_staging_model(model_name) - - dagger_inputs.append(athena_input) - else: - athena_input["name"] = athena_input["table"] = model_name - athena_input["schema"] = parent["schema"] - - s3_input["name"] = model_name - s3_input["bucket"] = self._default_data_bucket - s3_input["path"] = parent["relative_s3_path"] - - dagger_inputs.append(athena_input) - dagger_inputs.append(s3_input) - - return dagger_inputs or None - - def generate_dagger_outputs( - self, model_name: str, schema: str, relative_s3_path: str - ) -> list[dict]: - """ - Generates the dagger outputs for the respective dbt model. - This means that an Athena and S3 output is added for the dbt model. 
- Args: - model_name: The name of the dbt model - schema: The schema of the dbt model - relative_s3_path: The S3 path of the dbt model relative to the data bucket + if model_name.startswith("stg_"): + source_nodes = node.get("depends_on", {}).get("nodes", []) + for source_node in source_nodes: + _, project_name, schema_name, table_name = source_node.split(".") + athena_task = ATHENA_TASK_BASE.copy() - Returns: - list[dict]: dagger S3 and Athena outputs for the respective dbt model + athena_task["name"] = f"stg_{schema_name}__{table_name}" + athena_task["schema"] = schema_name + athena_task["table"] = table_name - """ - athena_input = ATHENA_IO_BASE.copy() - s3_input = S3_IO_BASE.copy() + dagger_tasks.append(athena_task) + else: + athena_task = ATHENA_TASK_BASE.copy() + model_schema = node["schema"] + athena_task["name"] = f"{model_schema}_{model_name}_athena" + athena_task["table"] = model_name + athena_task["schema"] = node["schema"] - athena_input["name"] = athena_input["table"] = s3_input["name"] = model_name - athena_input["schema"] = schema + s3_task["name"] = f"{model_schema}_{model_name}_s3" + s3_task["bucket"] = self._default_data_bucket + s3_task["path"] = self._get_model_data_location( + node, model_schema, model_name + ) - s3_input["bucket"] = "cho${ENV}-data-lake" - s3_input["relative_s3_path"] = relative_s3_path + dagger_tasks.append(athena_task) + dagger_tasks.append(s3_task) - return [athena_input, s3_input] + return dagger_tasks def _get_model_data_location( self, node: dict, schema: str, dbt_model_name: str @@ -148,59 +92,39 @@ def _get_model_data_location( dbt_model_name: The name of the dbt model Returns: - str: The S3 path of the dbt model relative to the data bucket + str: The relative S3 path of the dbt model relative to the data bucket """ location = node.get("config", {}).get("external_location") if not location: location = join(self._default_data_dir, schema, dbt_model_name) - return location.split("data-lake/")[1] + return 
location.split(self._default_data_bucket)[1].lstrip("/") - def _get_dbt_model_parents(self, model_name: str) -> dict: + def generate_dagger_io(self, model_name: str) -> Tuple[list, list]: """ - Gets all parents of a single dbt model from the manifest.json file + Parse through all the parents of the DBT model and return the dagger inputs and outputs for the DBT model Args: model_name: The name of the DBT model Returns: - dict: All parents of the dbt model along with the name, schema and S3 path of the dbt model itself + Tuple[list, list]: The dagger inputs and outputs for the DBT model """ - inputs_dict = {} inputs_list = [] - dbt_ref_to_model = f"model.{self._dbt_project_dir}.{model_name}" nodes = self._manifest_data["nodes"] - model_info = nodes[f"model.main.{model_name}"] + model_node = nodes[f"model.main.{model_name}"] - parent_node_names = model_info.get("depends_on", {}).get("nodes", []) - parent_model_names = [x.split(".")[-1] for x in parent_node_names] + parent_node_names = model_node.get("depends_on", {}).get("nodes", []) for index, parent_node_name in enumerate(parent_node_names): if not (".int_" in parent_node_name): - parent_model_name = parent_node_name.split(".")[-1] parent_model_node = nodes.get(parent_node_name) - parent_schema = parent_model_node.get("schema") - - model_data_location = self._get_model_data_location( - parent_model_node, parent_schema, parent_model_name - ) - - inputs_list.append( - { - "schema": parent_schema, - "model_name": parent_model_names[index], - "relative_s3_path": model_data_location, - } - ) - - inputs_dict["model_name"] = model_name - inputs_dict["node_name"] = dbt_ref_to_model - inputs_dict["inputs"] = inputs_list - inputs_dict["schema"] = model_info["schema"] - inputs_dict["relative_s3_path"] = self._get_model_data_location( - model_info, model_info["schema"], model_name - ) - - return inputs_dict + dagger_input = self._generate_dagger_dependency(parent_model_node) + + inputs_list += dagger_input + + output_list = 
self._generate_dagger_dependency(model_node) + + return inputs_list, output_list diff --git a/dagger/utilities/module.py b/dagger/utilities/module.py index ee7540d..7b954e3 100644 --- a/dagger/utilities/module.py +++ b/dagger/utilities/module.py @@ -85,7 +85,7 @@ def generate_task_configs(self): task_dict = yaml.safe_load(task_str) if task == 'dbt': - inputs, outputs = self._dbt_module.generate_io(branch_name) + inputs, outputs = self._dbt_module.generate_dagger_io(branch_name) task_dict['inputs'] = inputs task_dict['outputs'] = outputs task_dict['task_parameters']['select'] = branch_name From b52e0b540dc023cd64574c43ad8590629a757e1a Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Fri, 17 Nov 2023 21:19:56 +0100 Subject: [PATCH 031/134] updated tests --- .../modules/dbt_config_parser_fixtures.py | 86 ++++++++++++------- tests/utilities/test_dbt_config_parser.py | 54 ++++++++---- 2 files changed, 94 insertions(+), 46 deletions(-) diff --git a/tests/fixtures/modules/dbt_config_parser_fixtures.py b/tests/fixtures/modules/dbt_config_parser_fixtures.py index 821c5b8..d3ec583 100644 --- a/tests/fixtures/modules/dbt_config_parser_fixtures.py +++ b/tests/fixtures/modules/dbt_config_parser_fixtures.py @@ -25,7 +25,7 @@ "macro.main.macro2", ], "nodes": [ - "model.main.stg_core_schema1__table1", + "model.main.stg_core_schema2__table2", "model.main.model2", "model.main.int_model3", ], @@ -33,14 +33,32 @@ }, "model.main.stg_core_schema1__table1": { "schema": "analytics_engineering", + "name": "stg_core_schema1__table1", + "depends_on": { + "macros": [], + "nodes": ["source.main.core_schema1.table1"], + }, + }, + "model.main.stg_core_schema2__table2": { + "schema": "analytics_engineering", + "name": "stg_core_schema2__table2", + "depends_on": { + "macros": [], + "nodes": [ + "source.main.core_schema2.table2", + "source.main.core_schema2.table3", + ], + }, }, "model.main.model2": { + "name": "model2", "schema": "analytics_engineering", "config": { "external_location": 
"s3://bucket1-data-lake/path2/model2", }, }, "model.main.int_model3": { + "name": "int_model3", "schema": "analytics_engineering", }, } @@ -66,41 +84,51 @@ } } -EXPECTED_DBT_MODEL_PARENTS = { - "inputs": [ - { - "model_name": "stg_core_schema1__table1", - "relative_s3_path": "path1/tmp/analytics_engineering/stg_core_schema1__table1", - "schema": "analytics_engineering", - }, - { - "model_name": "model2", - "relative_s3_path": "path2/model2", - "schema": "analytics_engineering", - }, - ], - "model_name": "model1", - "node_name": "model.main.model1", - "relative_s3_path": "path1/model1", - "schema": "analytics_engineering", -} - -EXPECTED_DAGGER_INPUTS = [ +EXPECTED_STAGING_NODE = [ { + "type": "athena", "name": "stg_core_schema1__table1", - "schema": "schema1", + "schema": "core_schema1", "table": "table1", + } +] +EXPECTED_STAGING_NODE_MULTIPLE_DEPENDENCIES = [ + { + "type": "athena", + "name": "stg_core_schema2__table2", + "schema": "core_schema2", + "table": "table2", + }, + { + "type": "athena", + "name": "stg_core_schema2__table3", + "schema": "core_schema2", + "table": "table3", + }, +] + +EXPECTED_DAGGER_INPUTS = [ + { + "name": "stg_core_schema2__table2", + "schema": "core_schema2", + "table": "table2", "type": "athena", }, { - "name": "model2", + "name": "stg_core_schema2__table3", + "schema": "core_schema2", + "table": "table3", + "type": "athena", + }, + { + "name": "analytics_engineering_model2_athena", "schema": "analytics_engineering", "table": "model2", "type": "athena", }, { "bucket": "bucket1-data-lake", - "name": "model2", + "name": "analytics_engineering_model2_s3", "path": "path2/model2", "type": "s3", }, @@ -108,15 +136,15 @@ EXPECTED_DAGGER_OUTPUTS = [ { - "name": "model1", + "name": "analytics_engineering_fct_supplier_revenue_athena", "schema": "analytics_engineering", - "table": "model1", + "table": "fct_supplier_revenue", "type": "athena", }, { - "bucket": "cho${ENV}-data-lake", - "name": "model1", - "relative_s3_path": "path1/model1", + 
"bucket": "bucket1-data-lake", + "name": "analytics_engineering_fct_supplier_revenue_s3", + "path": "path1/model1", "type": "s3", }, ] diff --git a/tests/utilities/test_dbt_config_parser.py b/tests/utilities/test_dbt_config_parser.py index a09bbc4..e05fe4e 100644 --- a/tests/utilities/test_dbt_config_parser.py +++ b/tests/utilities/test_dbt_config_parser.py @@ -6,11 +6,12 @@ from dagger.utilities.dbt_config_parser import DBTConfigParser from dagger.utilities.module import Module from tests.fixtures.modules.dbt_config_parser_fixtures import ( - EXPECTED_DBT_MODEL_PARENTS, + EXPECTED_DAGGER_OUTPUTS, EXPECTED_DAGGER_INPUTS, DBT_MANIFEST_FILE_FIXTURE, DBT_PROFILE_FIXTURE, - EXPECTED_DAGGER_OUTPUTS, + EXPECTED_STAGING_NODE, + EXPECTED_STAGING_NODE_MULTIPLE_DEPENDENCIES, ) _logger = logging.getLogger("root") @@ -29,25 +30,44 @@ class TestDBTConfigParser(unittest.TestCase): @patch("yaml.safe_load", return_value=DBT_PROFILE_FIXTURE) def setUp(self, mock_open, mock_json_load, mock_safe_load): self._dbt_config_parser = DBTConfigParser(DEFAULT_CONFIG_PARAMS) + self._sample_dbt_node = DBT_MANIFEST_FILE_FIXTURE["nodes"][ + "model.main.stg_core_schema1__table1" + ] + @skip("Run only locally") + def test_generate_task_configs(self): + module = Module( + path_to_config="./tests/fixtures/modules/dbt_test_config.yaml", + target_dir="./tests/fixtures/modules/", + ) - def test_get_dbt_model_parents(self): - result = self._dbt_config_parser._get_dbt_model_parents(MODEL_NAME) + module.generate_task_configs() - self.assertDictEqual(result, EXPECTED_DBT_MODEL_PARENTS) + def test_generate_dagger_dependency(self): + test_inputs = [ + ( + DBT_MANIFEST_FILE_FIXTURE["nodes"][ + "model.main.stg_core_schema1__table1" + ], + EXPECTED_STAGING_NODE, + ), + ( + DBT_MANIFEST_FILE_FIXTURE["nodes"][ + "model.main.stg_core_schema2__table2" + ], + EXPECTED_STAGING_NODE_MULTIPLE_DEPENDENCIES, + ), + ] + for mock_input, expected_output in test_inputs: + result = 
self._dbt_config_parser._generate_dagger_dependency(mock_input) + self.assertListEqual(result, expected_output) - def test_generate_dagger_inputs(self): - result_inputs = self._dbt_config_parser.generate_dagger_inputs( - EXPECTED_DBT_MODEL_PARENTS - ) + def test_generate_io_inputs(self): + result, _ = self._dbt_config_parser.generate_dagger_io(MODEL_NAME) - self.assertListEqual(result_inputs, EXPECTED_DAGGER_INPUTS) + self.assertListEqual(result, EXPECTED_DAGGER_INPUTS) - def test_generate_dagger_outputs(self): - result_outputs = self._dbt_config_parser.generate_dagger_outputs( - EXPECTED_DBT_MODEL_PARENTS["model_name"], - EXPECTED_DBT_MODEL_PARENTS["schema"], - EXPECTED_DBT_MODEL_PARENTS["relative_s3_path"], - ) + def test_generate_io_outputs(self): + _, result = self._dbt_config_parser.generate_dagger_io(MODEL_NAME) - self.assertListEqual(result_outputs, EXPECTED_DAGGER_OUTPUTS) + self.assertListEqual(result, EXPECTED_DAGGER_OUTPUTS) From 6e7ced264e0ae3e65472b56fc810e59947966b7e Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Mon, 20 Nov 2023 09:40:26 +0100 Subject: [PATCH 032/134] added dbt profile to default parameters for dbt task --- dagger/utilities/dbt_config_parser.py | 3 ++- tests/utilities/test_dbt_config_parser.py | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/dagger/utilities/dbt_config_parser.py b/dagger/utilities/dbt_config_parser.py index bb0d332..4725afd 100644 --- a/dagger/utilities/dbt_config_parser.py +++ b/dagger/utilities/dbt_config_parser.py @@ -17,6 +17,7 @@ class DBTConfigParser: """ def __init__(self, default_config_parameters: dict): + self._dbt_profile = default_config_parameters.get("dbt_profile", "data") self._default_data_bucket = default_config_parameters["data_bucket"] self._dbt_project_dir = default_config_parameters.get("project_dir", None) dbt_manifest_path = path.join(self._dbt_project_dir, "target", "manifest.json") @@ -29,7 +30,7 @@ def __init__(self, default_config_parameters: dict): profile_yaml = 
yaml.safe_load(open(dbt_profile_path, "r")) prod_dbt_profile = profile_yaml[self._dbt_project_dir.split("/")[-1]][ "outputs" - ]["data"] + ][self._dbt_profile] self._default_data_dir = prod_dbt_profile.get( "s3_data_dir" ) or prod_dbt_profile.get("s3_staging_dir") diff --git a/tests/utilities/test_dbt_config_parser.py b/tests/utilities/test_dbt_config_parser.py index e05fe4e..34d37a1 100644 --- a/tests/utilities/test_dbt_config_parser.py +++ b/tests/utilities/test_dbt_config_parser.py @@ -20,6 +20,7 @@ "data_bucket": "bucket1-data-lake", "project_dir": "main", "profile_dir": ".dbt", + "dbt_profile": "data", } MODEL_NAME = "model1" From e8cbc6f90a9654d283893876a9c0f1537f4b573a Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Mon, 20 Nov 2023 20:01:43 +0100 Subject: [PATCH 033/134] added follow external dependency as true as default for athena task --- dagger/utilities/dbt_config_parser.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/dagger/utilities/dbt_config_parser.py b/dagger/utilities/dbt_config_parser.py index 4725afd..bb06f3a 100644 --- a/dagger/utilities/dbt_config_parser.py +++ b/dagger/utilities/dbt_config_parser.py @@ -6,11 +6,10 @@ import yaml -ATHENA_TASK_BASE = {"type": "athena"} +ATHENA_TASK_BASE = {"type": "athena", "follow_external_dependency": True} S3_TASK_BASE = {"type": "s3"} - class DBTConfigParser: """ Module that parses the manifest.json file generated by dbt and generates the dagger inputs and outputs for the respective dbt model From e62d35cd054e00b2b7f132da347a58718677da39 Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Mon, 20 Nov 2023 20:03:04 +0100 Subject: [PATCH 034/134] add fn to process seed input --- dagger/utilities/dbt_config_parser.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/dagger/utilities/dbt_config_parser.py b/dagger/utilities/dbt_config_parser.py index bb06f3a..73fac6b 100644 --- a/dagger/utilities/dbt_config_parser.py +++ b/dagger/utilities/dbt_config_parser.py @@ -34,6 
+34,22 @@ def __init__(self, default_config_parameters: dict): "s3_data_dir" ) or prod_dbt_profile.get("s3_staging_dir") + def _process_seed_input(self, seed_node: dict) -> dict: + """ + Generates a dummy dagger task for the DBT seed node + Args: + seed_node: The extracted seed node from the manifest.json file + + Returns: + dict: The dummy dagger task for the DBT seed node + + """ + task = {} + task["name"] = seed_node.get("name", "") + task["type"] = "dummy" + + return task + def _generate_dagger_dependency(self, node: dict) -> List[Dict]: """ Generates the dagger task based on whether the DBT model node is a staging model or not. From 8de68213c7506ff4ad3ec4e040c2743efc7f28c3 Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Mon, 20 Nov 2023 20:03:34 +0100 Subject: [PATCH 035/134] refactor code to incorporate seeds --- dagger/utilities/dbt_config_parser.py | 40 +++++++++++++++++---------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/dagger/utilities/dbt_config_parser.py b/dagger/utilities/dbt_config_parser.py index 73fac6b..caa3f19 100644 --- a/dagger/utilities/dbt_config_parser.py +++ b/dagger/utilities/dbt_config_parser.py @@ -33,6 +33,10 @@ def __init__(self, default_config_parameters: dict): self._default_data_dir = prod_dbt_profile.get( "s3_data_dir" ) or prod_dbt_profile.get("s3_staging_dir") + self._default_schema = prod_dbt_profile.get("schema") + + self._nodes_in_manifest = self._manifest_data["nodes"] + self._sources_in_manifest = self._manifest_data["sources"] def _process_seed_input(self, seed_node: dict) -> dict: """ @@ -67,23 +71,30 @@ def _generate_dagger_dependency(self, node: dict) -> List[Dict]: s3_task = S3_TASK_BASE.copy() dagger_tasks = [] - if model_name.startswith("stg_"): - source_nodes = node.get("depends_on", {}).get("nodes", []) - for source_node in source_nodes: - _, project_name, schema_name, table_name = source_node.split(".") - athena_task = ATHENA_TASK_BASE.copy() - - athena_task["name"] = 
f"stg_{schema_name}__{table_name}" - athena_task["schema"] = schema_name - athena_task["table"] = table_name - - dagger_tasks.append(athena_task) + if node.get("resource_type") == "seed": + task = self._process_seed_input(node) + dagger_tasks.append(task) + elif model_name.startswith("stg_"): + source_node_names = node.get("depends_on", {}).get("nodes", []) + for source_node_name in source_node_names: + if source_node_name.startswith("seed"): + source_node = self._nodes_in_manifest[source_node_name] + task = self._process_seed_input(source_node) + else: + source_node = self._sources_in_manifest[source_node_name] + task = ATHENA_TASK_BASE.copy() + + task["schema"] = source_node.get("schema", self._default_schema) + task["table"] = source_node.get("name", "") + task["name"] = f"stg_{task['schema']}__{task['table']}" + + dagger_tasks.append(task) else: athena_task = ATHENA_TASK_BASE.copy() model_schema = node["schema"] athena_task["name"] = f"{model_schema}_{model_name}_athena" athena_task["table"] = model_name - athena_task["schema"] = node["schema"] + athena_task["schema"] = node.get("schema", self._default_schema) s3_task["name"] = f"{model_schema}_{model_name}_s3" s3_task["bucket"] = self._default_data_bucket @@ -130,14 +141,13 @@ def generate_dagger_io(self, model_name: str) -> Tuple[list, list]: """ inputs_list = [] - nodes = self._manifest_data["nodes"] - model_node = nodes[f"model.main.{model_name}"] + model_node = self._nodes_in_manifest[f"model.main.{model_name}"] parent_node_names = model_node.get("depends_on", {}).get("nodes", []) for index, parent_node_name in enumerate(parent_node_names): if not (".int_" in parent_node_name): - parent_model_node = nodes.get(parent_node_name) + parent_model_node = self._nodes_in_manifest.get(parent_node_name) dagger_input = self._generate_dagger_dependency(parent_model_node) inputs_list += dagger_input From 5f26723d449b606ceaf09781c5f8ec418deac3c5 Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Mon, 20 Nov 2023 
20:03:47 +0100 Subject: [PATCH 036/134] updates fixtures --- .../modules/dbt_config_parser_fixtures.py | 46 ++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/tests/fixtures/modules/dbt_config_parser_fixtures.py b/tests/fixtures/modules/dbt_config_parser_fixtures.py index d3ec583..ff17e07 100644 --- a/tests/fixtures/modules/dbt_config_parser_fixtures.py +++ b/tests/fixtures/modules/dbt_config_parser_fixtures.py @@ -61,7 +61,44 @@ "name": "int_model3", "schema": "analytics_engineering", }, - } + "seed.main.seed_buyer_country_overwrite": { + "database": "awsdatacatalog", + "schema": "analytics_engineering", + "name": "seed_buyer_country_overwrite", + "resource_type": "seed", + "alias": "seed_buyer_country_overwrite", + "tags": ["analytics"], + "description": "", + "created_at": 1700216177.105391, + "depends_on": {"macros": []}, + }, + }, + "sources": { + "source.main.core_schema1.table1": { + "source_name": "table1", + "database": "awsdatacatalog", + "schema": "core_schema1", + "name": "table1", + "tags": ["analytics"], + "description": "", + }, + "source.main.core_schema2.table2": { + "source_name": "table2", + "database": "awsdatacatalog", + "schema": "core_schema2", + "name": "table2", + "tags": ["analytics"], + "description": "", + }, + "source.main.core_schema2.table3": { + "source_name": "table3", + "database": "awsdatacatalog", + "schema": "core_schema2", + "name": "table3", + "tags": ["analytics"], + "description": "", + }, + }, } DBT_PROFILE_FIXTURE = { @@ -90,6 +127,7 @@ "name": "stg_core_schema1__table1", "schema": "core_schema1", "table": "table1", + "follow_external_dependency": True, } ] EXPECTED_STAGING_NODE_MULTIPLE_DEPENDENCIES = [ @@ -98,12 +136,14 @@ "name": "stg_core_schema2__table2", "schema": "core_schema2", "table": "table2", + "follow_external_dependency": True, }, { "type": "athena", "name": "stg_core_schema2__table3", "schema": "core_schema2", "table": "table3", + "follow_external_dependency": True, }, ] @@ 
-113,18 +153,21 @@ "schema": "core_schema2", "table": "table2", "type": "athena", + "follow_external_dependency": True, }, { "name": "stg_core_schema2__table3", "schema": "core_schema2", "table": "table3", "type": "athena", + "follow_external_dependency": True, }, { "name": "analytics_engineering_model2_athena", "schema": "analytics_engineering", "table": "model2", "type": "athena", + "follow_external_dependency": True, }, { "bucket": "bucket1-data-lake", @@ -140,6 +183,7 @@ "schema": "analytics_engineering", "table": "fct_supplier_revenue", "type": "athena", + "follow_external_dependency": True, }, { "bucket": "bucket1-data-lake", From 9acaff5de7b454d1a6487b7d6cd1a6661342c310 Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Mon, 20 Nov 2023 20:13:47 +0100 Subject: [PATCH 037/134] added test for dbt seed --- tests/utilities/test_dbt_config_parser.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/utilities/test_dbt_config_parser.py b/tests/utilities/test_dbt_config_parser.py index 34d37a1..d9a85d3 100644 --- a/tests/utilities/test_dbt_config_parser.py +++ b/tests/utilities/test_dbt_config_parser.py @@ -12,6 +12,7 @@ DBT_PROFILE_FIXTURE, EXPECTED_STAGING_NODE, EXPECTED_STAGING_NODE_MULTIPLE_DEPENDENCIES, + EXPECTED_SEED_NODE, ) _logger = logging.getLogger("root") @@ -58,6 +59,12 @@ def test_generate_dagger_dependency(self): ], EXPECTED_STAGING_NODE_MULTIPLE_DEPENDENCIES, ), + ( + DBT_MANIFEST_FILE_FIXTURE["nodes"][ + "seed.main.seed_buyer_country_overwrite" + ], + EXPECTED_SEED_NODE, + ), ] for mock_input, expected_output in test_inputs: result = self._dbt_config_parser._generate_dagger_dependency(mock_input) From 876010c9d49910f1c9a263f5e112e60d1b15922e Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Mon, 20 Nov 2023 20:14:09 +0100 Subject: [PATCH 038/134] modified tests for model containing dbt seed as a dependency --- .../fixtures/modules/dbt_config_parser_fixtures.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git 
a/tests/fixtures/modules/dbt_config_parser_fixtures.py b/tests/fixtures/modules/dbt_config_parser_fixtures.py index ff17e07..3033def 100644 --- a/tests/fixtures/modules/dbt_config_parser_fixtures.py +++ b/tests/fixtures/modules/dbt_config_parser_fixtures.py @@ -47,6 +47,7 @@ "nodes": [ "source.main.core_schema2.table2", "source.main.core_schema2.table3", + "seed.main.seed_buyer_country_overwrite", ], }, }, @@ -145,6 +146,17 @@ "table": "table3", "follow_external_dependency": True, }, + { + "type": "dummy", + "name": "seed_buyer_country_overwrite", + }, +] + +EXPECTED_SEED_NODE = [ + { + "type": "dummy", + "name": "seed_buyer_country_overwrite", + } ] EXPECTED_DAGGER_INPUTS = [ @@ -162,6 +174,7 @@ "type": "athena", "follow_external_dependency": True, }, + {"name": "seed_buyer_country_overwrite", "type": "dummy"}, { "name": "analytics_engineering_model2_athena", "schema": "analytics_engineering", From 9cd0a51b615809975e29942e03ca1234f0bb2dc7 Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Fri, 24 Nov 2023 12:52:44 +0100 Subject: [PATCH 039/134] refactor * deduplicate list of input dictionaries * created functions that generate the seed input and athena and s3 tasks * removed the follow_external_dependency as true as default for all athena inputs --- dagger/utilities/dbt_config_parser.py | 79 ++++++++++++++++++--------- 1 file changed, 52 insertions(+), 27 deletions(-) diff --git a/dagger/utilities/dbt_config_parser.py b/dagger/utilities/dbt_config_parser.py index caa3f19..d3ce6c7 100644 --- a/dagger/utilities/dbt_config_parser.py +++ b/dagger/utilities/dbt_config_parser.py @@ -1,12 +1,12 @@ import json +from collections import OrderedDict from os import path from os.path import join -from pprint import pprint from typing import Tuple, List, Dict import yaml -ATHENA_TASK_BASE = {"type": "athena", "follow_external_dependency": True} +ATHENA_TASK_BASE = {"type": "athena"} S3_TASK_BASE = {"type": "s3"} @@ -38,7 +38,7 @@ def __init__(self, default_config_parameters: 
dict): self._nodes_in_manifest = self._manifest_data["nodes"] self._sources_in_manifest = self._manifest_data["sources"] - def _process_seed_input(self, seed_node: dict) -> dict: + def _generate_seed_input(self, seed_node: dict) -> dict: """ Generates a dummy dagger task for the DBT seed node Args: @@ -54,7 +54,39 @@ def _process_seed_input(self, seed_node: dict) -> dict: return task - def _generate_dagger_dependency(self, node: dict) -> List[Dict]: + def _get_athena_task( + self, node: dict, follow_external_dependency: bool = False + ) -> dict: + node_name = node.get("unique_id", "") + + task = ATHENA_TASK_BASE.copy() + if follow_external_dependency: + task["follow_external_dependency"] = True + + task["schema"] = node.get("schema", self._default_schema) + task["table"] = node.get("name", "") + task["name"] = f"{task['schema']}__{task['table']}_athena" + + return task + + def _get_s3_task(self, node: dict) -> dict: + task = S3_TASK_BASE.copy() + + schema = node.get("schema", self._default_schema) + table = node.get("name", "") + task["name"] = f"{schema}__{table}_s3" + task["bucket"] = self._default_data_bucket + task["path"] = self._get_model_data_location(node, schema, table) + + return task + + def _generate_dagger_output(self, node: dict): + return [self._get_athena_task(node), self._get_s3_task(node)] + + def _generate_dagger_inputs( + self, + node: dict, + ) -> List[Dict]: """ Generates the dagger task based on whether the DBT model node is a staging model or not. If the DBT model node represents a staging model, then a dagger athena task is generated for each source of the DBT model. 
@@ -67,40 +99,27 @@ def _generate_dagger_dependency(self, node: dict) -> List[Dict]: """ model_name = node["name"] - - s3_task = S3_TASK_BASE.copy() dagger_tasks = [] if node.get("resource_type") == "seed": - task = self._process_seed_input(node) + task = self._generate_seed_input(node) dagger_tasks.append(task) elif model_name.startswith("stg_"): source_node_names = node.get("depends_on", {}).get("nodes", []) for source_node_name in source_node_names: if source_node_name.startswith("seed"): source_node = self._nodes_in_manifest[source_node_name] - task = self._process_seed_input(source_node) + task = self._generate_seed_input(source_node) else: source_node = self._sources_in_manifest[source_node_name] - task = ATHENA_TASK_BASE.copy() - - task["schema"] = source_node.get("schema", self._default_schema) - task["table"] = source_node.get("name", "") - task["name"] = f"stg_{task['schema']}__{task['table']}" + task = self._get_athena_task( + source_node, follow_external_dependency=True + ) dagger_tasks.append(task) else: - athena_task = ATHENA_TASK_BASE.copy() - model_schema = node["schema"] - athena_task["name"] = f"{model_schema}_{model_name}_athena" - athena_task["table"] = model_name - athena_task["schema"] = node.get("schema", self._default_schema) - - s3_task["name"] = f"{model_schema}_{model_name}_s3" - s3_task["bucket"] = self._default_data_bucket - s3_task["path"] = self._get_model_data_location( - node, model_schema, model_name - ) + athena_task = self._get_athena_task(node, follow_external_dependency=True) + s3_task = self._get_s3_task(node) dagger_tasks.append(athena_task) dagger_tasks.append(s3_task) @@ -148,10 +167,16 @@ def generate_dagger_io(self, model_name: str) -> Tuple[list, list]: for index, parent_node_name in enumerate(parent_node_names): if not (".int_" in parent_node_name): parent_model_node = self._nodes_in_manifest.get(parent_node_name) - dagger_input = self._generate_dagger_dependency(parent_model_node) + dagger_input = 
self._generate_dagger_inputs(parent_model_node) inputs_list += dagger_input - output_list = self._generate_dagger_dependency(model_node) + output_list = self._generate_dagger_output(model_node) + + unique_inputs = list( + OrderedDict( + (frozenset(item.items()), item) for item in inputs_list + ).values() + ) - return inputs_list, output_list + return unique_inputs, output_list From 2e6abf6f3bd3ebee59ffd6ad29f1aff00c13c3c1 Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Fri, 24 Nov 2023 12:52:59 +0100 Subject: [PATCH 040/134] updated tests and fixtures --- .../modules/dbt_config_parser_fixtures.py | 34 ++++++++++++------- tests/utilities/test_dbt_config_parser.py | 21 +++++++----- 2 files changed, 35 insertions(+), 20 deletions(-) diff --git a/tests/fixtures/modules/dbt_config_parser_fixtures.py b/tests/fixtures/modules/dbt_config_parser_fixtures.py index 3033def..3e73c0a 100644 --- a/tests/fixtures/modules/dbt_config_parser_fixtures.py +++ b/tests/fixtures/modules/dbt_config_parser_fixtures.py @@ -3,7 +3,8 @@ "model.main.model1": { "database": "awsdatacatalog", "schema": "analytics_engineering", - "name": "fct_supplier_revenue", + "unique_id": "model.main.model1", + "name": "model1", "config": { "external_location": "s3://bucket1-data-lake/path1/model1", "materialized": "incremental", @@ -28,11 +29,13 @@ "model.main.stg_core_schema2__table2", "model.main.model2", "model.main.int_model3", + "seed.main.seed_buyer_country_overwrite", ], }, }, "model.main.stg_core_schema1__table1": { "schema": "analytics_engineering", + "unique_id": "model.main.stg_core_schema1__table1", "name": "stg_core_schema1__table1", "depends_on": { "macros": [], @@ -42,6 +45,7 @@ "model.main.stg_core_schema2__table2": { "schema": "analytics_engineering", "name": "stg_core_schema2__table2", + "unique_id": "model.main.stg_core_schema2__table2", "depends_on": { "macros": [], "nodes": [ @@ -54,17 +58,21 @@ "model.main.model2": { "name": "model2", "schema": "analytics_engineering", + "unique_id": 
"model.main.model2", "config": { "external_location": "s3://bucket1-data-lake/path2/model2", }, + "depends_on": {"macros": [], "nodes": []}, }, "model.main.int_model3": { "name": "int_model3", + "unique_id": "model.main.int_model3", "schema": "analytics_engineering", }, "seed.main.seed_buyer_country_overwrite": { "database": "awsdatacatalog", "schema": "analytics_engineering", + "unique_id": "seed.main.seed_buyer_country_overwrite", "name": "seed_buyer_country_overwrite", "resource_type": "seed", "alias": "seed_buyer_country_overwrite", @@ -79,6 +87,7 @@ "source_name": "table1", "database": "awsdatacatalog", "schema": "core_schema1", + "unique_id": "source.main.core_schema1.table1", "name": "table1", "tags": ["analytics"], "description": "", @@ -87,6 +96,7 @@ "source_name": "table2", "database": "awsdatacatalog", "schema": "core_schema2", + "unique_id": "source.main.core_schema2.table2", "name": "table2", "tags": ["analytics"], "description": "", @@ -95,6 +105,7 @@ "source_name": "table3", "database": "awsdatacatalog", "schema": "core_schema2", + "unique_id": "source.main.core_schema2.table3", "name": "table3", "tags": ["analytics"], "description": "", @@ -125,7 +136,7 @@ EXPECTED_STAGING_NODE = [ { "type": "athena", - "name": "stg_core_schema1__table1", + "name": "core_schema1__table1_athena", "schema": "core_schema1", "table": "table1", "follow_external_dependency": True, @@ -134,14 +145,14 @@ EXPECTED_STAGING_NODE_MULTIPLE_DEPENDENCIES = [ { "type": "athena", - "name": "stg_core_schema2__table2", + "name": "core_schema2__table2_athena", "schema": "core_schema2", "table": "table2", "follow_external_dependency": True, }, { "type": "athena", - "name": "stg_core_schema2__table3", + "name": "core_schema2__table3_athena", "schema": "core_schema2", "table": "table3", "follow_external_dependency": True, @@ -161,14 +172,14 @@ EXPECTED_DAGGER_INPUTS = [ { - "name": "stg_core_schema2__table2", + "name": "core_schema2__table2_athena", "schema": "core_schema2", "table": 
"table2", "type": "athena", "follow_external_dependency": True, }, { - "name": "stg_core_schema2__table3", + "name": "core_schema2__table3_athena", "schema": "core_schema2", "table": "table3", "type": "athena", @@ -176,7 +187,7 @@ }, {"name": "seed_buyer_country_overwrite", "type": "dummy"}, { - "name": "analytics_engineering_model2_athena", + "name": "analytics_engineering__model2_athena", "schema": "analytics_engineering", "table": "model2", "type": "athena", @@ -184,7 +195,7 @@ }, { "bucket": "bucket1-data-lake", - "name": "analytics_engineering_model2_s3", + "name": "analytics_engineering__model2_s3", "path": "path2/model2", "type": "s3", }, @@ -192,15 +203,14 @@ EXPECTED_DAGGER_OUTPUTS = [ { - "name": "analytics_engineering_fct_supplier_revenue_athena", + "name": "analytics_engineering__model1_athena", "schema": "analytics_engineering", - "table": "fct_supplier_revenue", + "table": "model1", "type": "athena", - "follow_external_dependency": True, }, { "bucket": "bucket1-data-lake", - "name": "analytics_engineering_fct_supplier_revenue_s3", + "name": "analytics_engineering__model1_s3", "path": "path1/model1", "type": "s3", }, diff --git a/tests/utilities/test_dbt_config_parser.py b/tests/utilities/test_dbt_config_parser.py index d9a85d3..752eaaa 100644 --- a/tests/utilities/test_dbt_config_parser.py +++ b/tests/utilities/test_dbt_config_parser.py @@ -32,9 +32,7 @@ class TestDBTConfigParser(unittest.TestCase): @patch("yaml.safe_load", return_value=DBT_PROFILE_FIXTURE) def setUp(self, mock_open, mock_json_load, mock_safe_load): self._dbt_config_parser = DBTConfigParser(DEFAULT_CONFIG_PARAMS) - self._sample_dbt_node = DBT_MANIFEST_FILE_FIXTURE["nodes"][ - "model.main.stg_core_schema1__table1" - ] + self._sample_dbt_node = DBT_MANIFEST_FILE_FIXTURE["nodes"]["model.main.model1"] @skip("Run only locally") def test_generate_task_configs(self): @@ -45,37 +43,44 @@ def test_generate_task_configs(self): module.generate_task_configs() - def 
test_generate_dagger_dependency(self): + def test_generate_dagger_inputs(self): test_inputs = [ ( DBT_MANIFEST_FILE_FIXTURE["nodes"][ "model.main.stg_core_schema1__table1" ], EXPECTED_STAGING_NODE, + True, ), ( DBT_MANIFEST_FILE_FIXTURE["nodes"][ "model.main.stg_core_schema2__table2" ], EXPECTED_STAGING_NODE_MULTIPLE_DEPENDENCIES, + True, ), ( DBT_MANIFEST_FILE_FIXTURE["nodes"][ "seed.main.seed_buyer_country_overwrite" ], EXPECTED_SEED_NODE, + False, ), ] - for mock_input, expected_output in test_inputs: - result = self._dbt_config_parser._generate_dagger_dependency(mock_input) + for mock_input, expected_output, follow_external_dependency in test_inputs: + result = self._dbt_config_parser._generate_dagger_inputs(mock_input) self.assertListEqual(result, expected_output) def test_generate_io_inputs(self): - result, _ = self._dbt_config_parser.generate_dagger_io(MODEL_NAME) + result, _ = self._dbt_config_parser.generate_dagger_io( + self._sample_dbt_node.get("name") + ) self.assertListEqual(result, EXPECTED_DAGGER_INPUTS) def test_generate_io_outputs(self): - _, result = self._dbt_config_parser.generate_dagger_io(MODEL_NAME) + _, result = self._dbt_config_parser.generate_dagger_io( + self._sample_dbt_node.get("name") + ) self.assertListEqual(result, EXPECTED_DAGGER_OUTPUTS) From c652d56873fecf94051a4345f8386ec65d77f348 Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Fri, 24 Nov 2023 13:02:58 +0100 Subject: [PATCH 041/134] changed name of seed task generating fn --- dagger/utilities/dbt_config_parser.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/dagger/utilities/dbt_config_parser.py b/dagger/utilities/dbt_config_parser.py index d3ce6c7..dfdcf81 100644 --- a/dagger/utilities/dbt_config_parser.py +++ b/dagger/utilities/dbt_config_parser.py @@ -38,7 +38,8 @@ def __init__(self, default_config_parameters: dict): self._nodes_in_manifest = self._manifest_data["nodes"] self._sources_in_manifest = self._manifest_data["sources"] - def 
_generate_seed_input(self, seed_node: dict) -> dict: + @staticmethod + def _generate_seed_task(seed_node: dict) -> dict: """ Generates a dummy dagger task for the DBT seed node Args: @@ -102,14 +103,14 @@ def _generate_dagger_inputs( dagger_tasks = [] if node.get("resource_type") == "seed": - task = self._generate_seed_input(node) + task = self._generate_seed_task(node) dagger_tasks.append(task) elif model_name.startswith("stg_"): source_node_names = node.get("depends_on", {}).get("nodes", []) for source_node_name in source_node_names: if source_node_name.startswith("seed"): source_node = self._nodes_in_manifest[source_node_name] - task = self._generate_seed_input(source_node) + task = self._generate_seed_task(source_node) else: source_node = self._sources_in_manifest[source_node_name] task = self._get_athena_task( From ac5f48f1e0615e3310602128dfc231a86cd0b24a Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Fri, 24 Nov 2023 13:03:05 +0100 Subject: [PATCH 042/134] removed unused line --- dagger/utilities/dbt_config_parser.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/dagger/utilities/dbt_config_parser.py b/dagger/utilities/dbt_config_parser.py index dfdcf81..9f1a2a0 100644 --- a/dagger/utilities/dbt_config_parser.py +++ b/dagger/utilities/dbt_config_parser.py @@ -58,8 +58,6 @@ def _generate_seed_task(seed_node: dict) -> dict: def _get_athena_task( self, node: dict, follow_external_dependency: bool = False ) -> dict: - node_name = node.get("unique_id", "") - task = ATHENA_TASK_BASE.copy() if follow_external_dependency: task["follow_external_dependency"] = True From 46a833ded562622b1f0768482606ceb09001066e Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Mon, 27 Nov 2023 13:16:06 +0100 Subject: [PATCH 043/134] added docstrings --- dagger/utilities/dbt_config_parser.py | 28 +++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/dagger/utilities/dbt_config_parser.py b/dagger/utilities/dbt_config_parser.py index 9f1a2a0..1aa6b4e 100644 --- 
a/dagger/utilities/dbt_config_parser.py +++ b/dagger/utilities/dbt_config_parser.py @@ -58,6 +58,16 @@ def _generate_seed_task(seed_node: dict) -> dict: def _get_athena_task( self, node: dict, follow_external_dependency: bool = False ) -> dict: + """ + Generates the dagger athena task for the DBT model node + Args: + node: The extracted node from the manifest.json file + follow_external_dependency: Whether to follow external airflow dependencies or not + + Returns: + dict: The dagger athena task for the DBT model node + + """ task = ATHENA_TASK_BASE.copy() if follow_external_dependency: task["follow_external_dependency"] = True @@ -69,6 +79,15 @@ def _get_athena_task( return task def _get_s3_task(self, node: dict) -> dict: + """ + Generates the dagger s3 task for the DBT model node + Args: + node: The extracted node from the manifest.json file + + Returns: + dict: The dagger s3 task for the DBT model node + + """ task = S3_TASK_BASE.copy() schema = node.get("schema", self._default_schema) @@ -80,6 +99,15 @@ def _get_s3_task(self, node: dict) -> dict: return task def _generate_dagger_output(self, node: dict): + """ + Generates the dagger output for the DBT model node + Args: + node: The extracted node from the manifest.json file + + Returns: + dict: The dagger output, which is a combination of an athena and s3 task for the DBT model node + + """ return [self._get_athena_task(node), self._get_s3_task(node)] def _generate_dagger_inputs( From cdd0fea29c0d6154232d4ae52963e0f25a037848 Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Mon, 27 Nov 2023 13:17:09 +0100 Subject: [PATCH 044/134] removed unused test parameter --- tests/utilities/test_dbt_config_parser.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/utilities/test_dbt_config_parser.py b/tests/utilities/test_dbt_config_parser.py index 752eaaa..76ecd30 100644 --- a/tests/utilities/test_dbt_config_parser.py +++ b/tests/utilities/test_dbt_config_parser.py @@ -50,24 +50,21 @@ def 
test_generate_dagger_inputs(self): "model.main.stg_core_schema1__table1" ], EXPECTED_STAGING_NODE, - True, ), ( DBT_MANIFEST_FILE_FIXTURE["nodes"][ "model.main.stg_core_schema2__table2" ], EXPECTED_STAGING_NODE_MULTIPLE_DEPENDENCIES, - True, ), ( DBT_MANIFEST_FILE_FIXTURE["nodes"][ "seed.main.seed_buyer_country_overwrite" ], EXPECTED_SEED_NODE, - False, ), ] - for mock_input, expected_output, follow_external_dependency in test_inputs: + for mock_input, expected_output in test_inputs: result = self._dbt_config_parser._generate_dagger_inputs(mock_input) self.assertListEqual(result, expected_output) From 829ac94eb03bddd45d94bff9cf010eeb2bd16eb4 Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Mon, 27 Nov 2023 14:30:17 +0100 Subject: [PATCH 045/134] changed name of function for better understanding --- dagger/utilities/dbt_config_parser.py | 4 ++-- tests/utilities/test_dbt_config_parser.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/dagger/utilities/dbt_config_parser.py b/dagger/utilities/dbt_config_parser.py index 1aa6b4e..28d1761 100644 --- a/dagger/utilities/dbt_config_parser.py +++ b/dagger/utilities/dbt_config_parser.py @@ -110,7 +110,7 @@ def _generate_dagger_output(self, node: dict): """ return [self._get_athena_task(node), self._get_s3_task(node)] - def _generate_dagger_inputs( + def _generate_dagger_tasks( self, node: dict, ) -> List[Dict]: @@ -194,7 +194,7 @@ def generate_dagger_io(self, model_name: str) -> Tuple[list, list]: for index, parent_node_name in enumerate(parent_node_names): if not (".int_" in parent_node_name): parent_model_node = self._nodes_in_manifest.get(parent_node_name) - dagger_input = self._generate_dagger_inputs(parent_model_node) + dagger_input = self._generate_dagger_tasks(parent_model_node) inputs_list += dagger_input diff --git a/tests/utilities/test_dbt_config_parser.py b/tests/utilities/test_dbt_config_parser.py index 76ecd30..7df0a66 100644 --- a/tests/utilities/test_dbt_config_parser.py +++ 
b/tests/utilities/test_dbt_config_parser.py @@ -65,7 +65,7 @@ def test_generate_dagger_inputs(self): ), ] for mock_input, expected_output in test_inputs: - result = self._dbt_config_parser._generate_dagger_inputs(mock_input) + result = self._dbt_config_parser._generate_dagger_tasks(mock_input) self.assertListEqual(result, expected_output) def test_generate_io_inputs(self): From eaf42865b48c6ad2bd264eefd77e5655fbd0d8ea Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Mon, 27 Nov 2023 14:30:47 +0100 Subject: [PATCH 046/134] added test to check for de-duplication of inputs --- .../modules/dbt_config_parser_fixtures.py | 51 +++++++++++++++++++ tests/utilities/test_dbt_config_parser.py | 17 +++++-- 2 files changed, 64 insertions(+), 4 deletions(-) diff --git a/tests/fixtures/modules/dbt_config_parser_fixtures.py b/tests/fixtures/modules/dbt_config_parser_fixtures.py index 3e73c0a..84e8a81 100644 --- a/tests/fixtures/modules/dbt_config_parser_fixtures.py +++ b/tests/fixtures/modules/dbt_config_parser_fixtures.py @@ -81,6 +81,23 @@ "created_at": 1700216177.105391, "depends_on": {"macros": []}, }, + "model.main.model3": { + "name": "model3", + "schema": "analytics_engineering", + "unique_id": "model.main.model3", + "config": { + "external_location": "s3://bucket1-data-lake/path2/model3", + }, + "depends_on": { + "macros": [], + "nodes": [ + "model.main.int_model3", + "model.main.model2", + "seed.main.seed_buyer_country_overwrite", + "model.main.stg_core_schema2__table2", + ], + }, + }, }, "sources": { "source.main.core_schema1.table1": { @@ -170,6 +187,40 @@ } ] +EXPECTED_MODEL_MULTIPLE_DEPENDENCIES = [ + { + "type": "athena", + "name": "analytics_engineering__model2_athena", + "schema": "analytics_engineering", + "table": "model2", + "follow_external_dependency": True, + }, + { + "bucket": "bucket1-data-lake", + "name": "analytics_engineering__model2_s3", + "path": "path2/model2", + "type": "s3", + }, + { + "type": "dummy", + "name": "seed_buyer_country_overwrite", + }, 
+ { + "type": "athena", + "name": "core_schema2__table2_athena", + "schema": "core_schema2", + "table": "table2", + "follow_external_dependency": True, + }, + { + "type": "athena", + "name": "core_schema2__table3_athena", + "schema": "core_schema2", + "table": "table3", + "follow_external_dependency": True, + }, +] + EXPECTED_DAGGER_INPUTS = [ { "name": "core_schema2__table2_athena", diff --git a/tests/utilities/test_dbt_config_parser.py b/tests/utilities/test_dbt_config_parser.py index 7df0a66..ecc7fc7 100644 --- a/tests/utilities/test_dbt_config_parser.py +++ b/tests/utilities/test_dbt_config_parser.py @@ -13,6 +13,7 @@ EXPECTED_STAGING_NODE, EXPECTED_STAGING_NODE_MULTIPLE_DEPENDENCIES, EXPECTED_SEED_NODE, + EXPECTED_MODEL_MULTIPLE_DEPENDENCIES, ) _logger = logging.getLogger("root") @@ -69,11 +70,19 @@ def test_generate_dagger_inputs(self): self.assertListEqual(result, expected_output) def test_generate_io_inputs(self): - result, _ = self._dbt_config_parser.generate_dagger_io( - self._sample_dbt_node.get("name") - ) + fixtures = [ + ("model1", EXPECTED_DAGGER_INPUTS), + ( + "model3", + EXPECTED_MODEL_MULTIPLE_DEPENDENCIES, + ), + ] + for mock_input, expected_output in fixtures: + result, _ = self._dbt_config_parser.generate_dagger_io( + mock_input + ) - self.assertListEqual(result, EXPECTED_DAGGER_INPUTS) + self.assertListEqual(result, expected_output) def test_generate_io_outputs(self): _, result = self._dbt_config_parser.generate_dagger_io( From ab9a2c892a227ec918a32162a6224e2fc5705adc Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Wed, 29 Nov 2023 11:28:59 +0100 Subject: [PATCH 047/134] refactored getting model location function this was done because the bucket name in the main module config can be different for how the manifest file is compiled --- dagger/utilities/dbt_config_parser.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/dagger/utilities/dbt_config_parser.py b/dagger/utilities/dbt_config_parser.py index 
28d1761..4cd71d7 100644 --- a/dagger/utilities/dbt_config_parser.py +++ b/dagger/utilities/dbt_config_parser.py @@ -94,7 +94,7 @@ def _get_s3_task(self, node: dict) -> dict: table = node.get("name", "") task["name"] = f"{schema}__{table}_s3" task["bucket"] = self._default_data_bucket - task["path"] = self._get_model_data_location(node, schema, table) + task["path"] = self._get_model_data_location(node, schema, table)[1] return task @@ -155,7 +155,7 @@ def _generate_dagger_tasks( def _get_model_data_location( self, node: dict, schema: str, dbt_model_name: str - ) -> str: + ) -> Tuple[str, str]: """ Gets the S3 path of the dbt model relative to the data bucket. If external location is not specified in the DBT model config, then the default data directory from the @@ -173,7 +173,10 @@ def _get_model_data_location( if not location: location = join(self._default_data_dir, schema, dbt_model_name) - return location.split(self._default_data_bucket)[1].lstrip("/") + split = location.split("//")[1].split("/") + bucket_name, data_path = split[0], "/".join(split[1:]) + + return bucket_name, data_path def generate_dagger_io(self, model_name: str) -> Tuple[list, list]: """ From 8ef70503dd7302d7abe4c3dacfea0297efa06c59 Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Wed, 29 Nov 2023 11:30:14 +0100 Subject: [PATCH 048/134] refactor dummy task generation --- dagger/utilities/dbt_config_parser.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/dagger/utilities/dbt_config_parser.py b/dagger/utilities/dbt_config_parser.py index 4cd71d7..d9fe1dd 100644 --- a/dagger/utilities/dbt_config_parser.py +++ b/dagger/utilities/dbt_config_parser.py @@ -39,20 +39,23 @@ def __init__(self, default_config_parameters: dict): self._sources_in_manifest = self._manifest_data["sources"] @staticmethod - def _generate_seed_task(seed_node: dict) -> dict: + def _get_dummy_task(node: dict, follow_external_dependency: bool = False) -> dict: """ - Generates a dummy 
dagger task for the DBT seed node + Generates a dummy dagger task Args: - seed_node: The extracted seed node from the manifest.json file + node: The extracted node from the manifest.json file Returns: - dict: The dummy dagger task for the DBT seed node + dict: The dummy dagger task for the DBT node """ task = {} - task["name"] = seed_node.get("name", "") + task["name"] = node.get("name", "") task["type"] = "dummy" + if follow_external_dependency: + task["follow_external_dependency"] = True + return task def _get_athena_task( @@ -116,6 +119,7 @@ def _generate_dagger_tasks( ) -> List[Dict]: """ Generates the dagger task based on whether the DBT model node is a staging model or not. + If the DBT model node represents a DBT seed or an ephemeral model, then a dagger dummy task is generated. If the DBT model node represents a staging model, then a dagger athena task is generated for each source of the DBT model. If the DBT model node is not a staging model, then a dagger athena task and an s3 task is generated for the DBT model node itself. 
Args: @@ -129,14 +133,17 @@ def _generate_dagger_tasks( dagger_tasks = [] if node.get("resource_type") == "seed": - task = self._generate_seed_task(node) + task = self._get_dummy_task(node) + dagger_tasks.append(task) + elif node.get("config",{}).get("materialized") == "ephemeral": + task = self._get_dummy_task(node, follow_external_dependency=True) dagger_tasks.append(task) elif model_name.startswith("stg_"): source_node_names = node.get("depends_on", {}).get("nodes", []) for source_node_name in source_node_names: if source_node_name.startswith("seed"): source_node = self._nodes_in_manifest[source_node_name] - task = self._generate_seed_task(source_node) + task = self._get_dummy_task(source_node) else: source_node = self._sources_in_manifest[source_node_name] task = self._get_athena_task( From 65a07d76877734177805ced9fffc1ba0e9965471 Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Wed, 29 Nov 2023 11:31:12 +0100 Subject: [PATCH 049/134] generate inputs for intermediate models and updated tests --- dagger/utilities/dbt_config_parser.py | 7 ++--- .../modules/dbt_config_parser_fixtures.py | 28 +++++++++++++++++++ tests/utilities/test_dbt_config_parser.py | 11 +++++--- 3 files changed, 38 insertions(+), 8 deletions(-) diff --git a/dagger/utilities/dbt_config_parser.py b/dagger/utilities/dbt_config_parser.py index d9fe1dd..a4390fb 100644 --- a/dagger/utilities/dbt_config_parser.py +++ b/dagger/utilities/dbt_config_parser.py @@ -202,11 +202,10 @@ def generate_dagger_io(self, model_name: str) -> Tuple[list, list]: parent_node_names = model_node.get("depends_on", {}).get("nodes", []) for index, parent_node_name in enumerate(parent_node_names): - if not (".int_" in parent_node_name): - parent_model_node = self._nodes_in_manifest.get(parent_node_name) - dagger_input = self._generate_dagger_tasks(parent_model_node) + parent_model_node = self._nodes_in_manifest.get(parent_node_name) + dagger_input = self._generate_dagger_tasks(parent_model_node) - inputs_list += 
dagger_input + inputs_list += dagger_input output_list = self._generate_dagger_output(model_node) diff --git a/tests/fixtures/modules/dbt_config_parser_fixtures.py b/tests/fixtures/modules/dbt_config_parser_fixtures.py index 84e8a81..ab887d4 100644 --- a/tests/fixtures/modules/dbt_config_parser_fixtures.py +++ b/tests/fixtures/modules/dbt_config_parser_fixtures.py @@ -37,6 +37,9 @@ "schema": "analytics_engineering", "unique_id": "model.main.stg_core_schema1__table1", "name": "stg_core_schema1__table1", + "config": { + "materialized": "view", + }, "depends_on": { "macros": [], "nodes": ["source.main.core_schema1.table1"], @@ -46,6 +49,9 @@ "schema": "analytics_engineering", "name": "stg_core_schema2__table2", "unique_id": "model.main.stg_core_schema2__table2", + "config": { + "materialized": "view", + }, "depends_on": { "macros": [], "nodes": [ @@ -61,6 +67,7 @@ "unique_id": "model.main.model2", "config": { "external_location": "s3://bucket1-data-lake/path2/model2", + "materialized": "table", }, "depends_on": {"macros": [], "nodes": []}, }, @@ -68,6 +75,9 @@ "name": "int_model3", "unique_id": "model.main.int_model3", "schema": "analytics_engineering", + "config": { + "materialized": "ephemeral", + }, }, "seed.main.seed_buyer_country_overwrite": { "database": "awsdatacatalog", @@ -188,6 +198,11 @@ ] EXPECTED_MODEL_MULTIPLE_DEPENDENCIES = [ + { + "type": "dummy", + "name": "int_model3", + "follow_external_dependency": True, + }, { "type": "athena", "name": "analytics_engineering__model2_athena", @@ -221,6 +236,14 @@ }, ] +EXPECTED_EPHEMERAL_NODE = [ + { + "type": "dummy", + "name": "int_model3", + "follow_external_dependency": True, + } +] + EXPECTED_DAGGER_INPUTS = [ { "name": "core_schema2__table2_athena", @@ -250,6 +273,11 @@ "path": "path2/model2", "type": "s3", }, + { + "type": "dummy", + "name": "int_model3", + "follow_external_dependency": True, + }, ] EXPECTED_DAGGER_OUTPUTS = [ diff --git a/tests/utilities/test_dbt_config_parser.py 
b/tests/utilities/test_dbt_config_parser.py index ecc7fc7..549e41c 100644 --- a/tests/utilities/test_dbt_config_parser.py +++ b/tests/utilities/test_dbt_config_parser.py @@ -14,6 +14,7 @@ EXPECTED_STAGING_NODE_MULTIPLE_DEPENDENCIES, EXPECTED_SEED_NODE, EXPECTED_MODEL_MULTIPLE_DEPENDENCIES, + EXPECTED_EPHEMERAL_NODE, ) _logger = logging.getLogger("root") @@ -35,7 +36,7 @@ def setUp(self, mock_open, mock_json_load, mock_safe_load): self._dbt_config_parser = DBTConfigParser(DEFAULT_CONFIG_PARAMS) self._sample_dbt_node = DBT_MANIFEST_FILE_FIXTURE["nodes"]["model.main.model1"] - @skip("Run only locally") + # @skip("Run only locally") def test_generate_task_configs(self): module = Module( path_to_config="./tests/fixtures/modules/dbt_test_config.yaml", @@ -64,6 +65,10 @@ def test_generate_dagger_inputs(self): ], EXPECTED_SEED_NODE, ), + ( + DBT_MANIFEST_FILE_FIXTURE["nodes"]["model.main.int_model3"], + EXPECTED_EPHEMERAL_NODE, + ), ] for mock_input, expected_output in test_inputs: result = self._dbt_config_parser._generate_dagger_tasks(mock_input) @@ -78,9 +83,7 @@ def test_generate_io_inputs(self): ), ] for mock_input, expected_output in fixtures: - result, _ = self._dbt_config_parser.generate_dagger_io( - mock_input - ) + result, _ = self._dbt_config_parser.generate_dagger_io(mock_input) self.assertListEqual(result, expected_output) From e5afcfa980d93129e20638c4778b5b9c39220408 Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Wed, 29 Nov 2023 11:32:54 +0100 Subject: [PATCH 050/134] uncomment skipping local test --- tests/utilities/test_dbt_config_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/utilities/test_dbt_config_parser.py b/tests/utilities/test_dbt_config_parser.py index 549e41c..7c3557c 100644 --- a/tests/utilities/test_dbt_config_parser.py +++ b/tests/utilities/test_dbt_config_parser.py @@ -36,7 +36,7 @@ def setUp(self, mock_open, mock_json_load, mock_safe_load): self._dbt_config_parser = 
DBTConfigParser(DEFAULT_CONFIG_PARAMS) self._sample_dbt_node = DBT_MANIFEST_FILE_FIXTURE["nodes"]["model.main.model1"] - # @skip("Run only locally") + @skip("Run only locally") def test_generate_task_configs(self): module = Module( path_to_config="./tests/fixtures/modules/dbt_test_config.yaml", From e28a078a573339aed405187adfc67497cfbf3020 Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Wed, 29 Nov 2023 15:31:51 +0100 Subject: [PATCH 051/134] refactor generate_dagger_tasks fn to make recursive --- dagger/utilities/dbt_config_parser.py | 45 ++++++++++++++------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/dagger/utilities/dbt_config_parser.py b/dagger/utilities/dbt_config_parser.py index a4390fb..6c80cb4 100644 --- a/dagger/utilities/dbt_config_parser.py +++ b/dagger/utilities/dbt_config_parser.py @@ -103,7 +103,8 @@ def _get_s3_task(self, node: dict) -> dict: def _generate_dagger_output(self, node: dict): """ - Generates the dagger output for the DBT model node + Generates the dagger output for the DBT model node. If the model is materialized as a view or ephemeral, then a dummy task is created. + Otherwise, an athena and s3 task is created for the DBT model node. Args: node: The extracted node from the manifest.json file @@ -111,16 +112,19 @@ def _generate_dagger_output(self, node: dict): dict: The dagger output, which is a combination of an athena and s3 task for the DBT model node """ - return [self._get_athena_task(node), self._get_s3_task(node)] + if node.get("config", {}).get("materialized") in ("view", "ephemeral"): + return [self._get_dummy_task(node)] + else: + return [self._get_athena_task(node), self._get_s3_task(node)] def _generate_dagger_tasks( self, - node: dict, + node_name: str, ) -> List[Dict]: """ Generates the dagger task based on whether the DBT model node is a staging model or not. If the DBT model node represents a DBT seed or an ephemeral model, then a dagger dummy task is generated. 
- If the DBT model node represents a staging model, then a dagger athena task is generated for each source of the DBT model. + If the DBT model node represents a staging model, then a dagger athena task is generated for each source of the DBT model. Apart from this, a dummy task is also generated for the staging model itself. If the DBT model node is not a staging model, then a dagger athena task and an s3 task is generated for the DBT model node itself. Args: node: The extracted node from the manifest.json file @@ -129,28 +133,28 @@ def _generate_dagger_tasks( List[Dict]: The respective dagger tasks for the DBT model node """ - model_name = node["name"] dagger_tasks = [] + if node_name.startswith("source"): + node = self._sources_in_manifest[node_name] + else: + node = self._nodes_in_manifest[node_name] + if node.get("resource_type") == "seed": task = self._get_dummy_task(node) dagger_tasks.append(task) - elif node.get("config",{}).get("materialized") == "ephemeral": + elif node.get("resource_type") == 'source': + athena_task = self._get_athena_task(node, follow_external_dependency=True) + dagger_tasks.append(athena_task) + elif node.get("config", {}).get("materialized") == "ephemeral": task = self._get_dummy_task(node, follow_external_dependency=True) dagger_tasks.append(task) - elif model_name.startswith("stg_"): + elif node.get("name").startswith("stg_"): source_node_names = node.get("depends_on", {}).get("nodes", []) + dagger_tasks.append(self._get_dummy_task(node)) for source_node_name in source_node_names: - if source_node_name.startswith("seed"): - source_node = self._nodes_in_manifest[source_node_name] - task = self._get_dummy_task(source_node) - else: - source_node = self._sources_in_manifest[source_node_name] - task = self._get_athena_task( - source_node, follow_external_dependency=True - ) - - dagger_tasks.append(task) + task = self._generate_dagger_tasks(source_node_name) + dagger_tasks.extend(task) else: athena_task = self._get_athena_task(node, 
follow_external_dependency=True) s3_task = self._get_s3_task(node) @@ -185,7 +189,7 @@ def _get_model_data_location( return bucket_name, data_path - def generate_dagger_io(self, model_name: str) -> Tuple[list, list]: + def generate_dagger_io(self, model_name: str) -> Tuple[List[dict], List[dict]]: """ Parse through all the parents of the DBT model and return the dagger inputs and outputs for the DBT model Args: @@ -201,9 +205,8 @@ def generate_dagger_io(self, model_name: str) -> Tuple[list, list]: parent_node_names = model_node.get("depends_on", {}).get("nodes", []) - for index, parent_node_name in enumerate(parent_node_names): - parent_model_node = self._nodes_in_manifest.get(parent_node_name) - dagger_input = self._generate_dagger_tasks(parent_model_node) + for parent_node_name in parent_node_names: + dagger_input = self._generate_dagger_tasks(parent_node_name) inputs_list += dagger_input From ab482292545cfb1b3b57912c56714a01235e90d1 Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Wed, 29 Nov 2023 15:32:49 +0100 Subject: [PATCH 052/134] updated fixtures and tests --- .../modules/dbt_config_parser_fixtures.py | 34 ++++++++++++++++++- tests/utilities/test_dbt_config_parser.py | 28 +++++++-------- 2 files changed, 47 insertions(+), 15 deletions(-) diff --git a/tests/fixtures/modules/dbt_config_parser_fixtures.py b/tests/fixtures/modules/dbt_config_parser_fixtures.py index ab887d4..432b2a3 100644 --- a/tests/fixtures/modules/dbt_config_parser_fixtures.py +++ b/tests/fixtures/modules/dbt_config_parser_fixtures.py @@ -114,6 +114,7 @@ "source_name": "table1", "database": "awsdatacatalog", "schema": "core_schema1", + "resource_type": "source", "unique_id": "source.main.core_schema1.table1", "name": "table1", "tags": ["analytics"], @@ -123,6 +124,7 @@ "source_name": "table2", "database": "awsdatacatalog", "schema": "core_schema2", + "resource_type": "source", "unique_id": "source.main.core_schema2.table2", "name": "table2", "tags": ["analytics"], @@ -132,6 +134,7 @@ 
"source_name": "table3", "database": "awsdatacatalog", "schema": "core_schema2", + "resource_type": "source", "unique_id": "source.main.core_schema2.table3", "name": "table3", "tags": ["analytics"], @@ -161,15 +164,17 @@ } EXPECTED_STAGING_NODE = [ + {"name": "stg_core_schema1__table1", "type": "dummy"}, { "type": "athena", "name": "core_schema1__table1_athena", "schema": "core_schema1", "table": "table1", "follow_external_dependency": True, - } + }, ] EXPECTED_STAGING_NODE_MULTIPLE_DEPENDENCIES = [ + {"name": "stg_core_schema2__table2", "type": "dummy"}, { "type": "athena", "name": "core_schema2__table2_athena", @@ -220,6 +225,7 @@ "type": "dummy", "name": "seed_buyer_country_overwrite", }, + {"name": "stg_core_schema2__table2", "type": "dummy"}, { "type": "athena", "name": "core_schema2__table2_athena", @@ -245,6 +251,7 @@ ] EXPECTED_DAGGER_INPUTS = [ + {"name": "stg_core_schema2__table2", "type": "dummy"}, { "name": "core_schema2__table2_athena", "schema": "core_schema2", @@ -280,6 +287,24 @@ }, ] +EXPECTED_DBT_STAGING_MODEL_DAGGER_INPUTS = [ + { + "follow_external_dependency": True, + "name": "core_schema2__table2_athena", + "schema": "core_schema2", + "table": "table2", + "type": "athena", + }, + { + "follow_external_dependency": True, + "name": "core_schema2__table3_athena", + "schema": "core_schema2", + "table": "table3", + "type": "athena", + }, + {"name": "seed_buyer_country_overwrite", "type": "dummy"}, +] + EXPECTED_DAGGER_OUTPUTS = [ { "name": "analytics_engineering__model1_athena", @@ -294,3 +319,10 @@ "type": "s3", }, ] + +EXPECTED_DBT_STAGING_MODEL_DAGGER_OUTPUTS = [ + { + "type": "dummy", + "name": "stg_core_schema2__table2", + }, +] diff --git a/tests/utilities/test_dbt_config_parser.py b/tests/utilities/test_dbt_config_parser.py index 7c3557c..be9b3dc 100644 --- a/tests/utilities/test_dbt_config_parser.py +++ b/tests/utilities/test_dbt_config_parser.py @@ -15,6 +15,8 @@ EXPECTED_SEED_NODE, EXPECTED_MODEL_MULTIPLE_DEPENDENCIES, 
EXPECTED_EPHEMERAL_NODE, + EXPECTED_DBT_STAGING_MODEL_DAGGER_OUTPUTS, + EXPECTED_DBT_STAGING_MODEL_DAGGER_INPUTS, ) _logger = logging.getLogger("root") @@ -48,25 +50,19 @@ def test_generate_task_configs(self): def test_generate_dagger_inputs(self): test_inputs = [ ( - DBT_MANIFEST_FILE_FIXTURE["nodes"][ - "model.main.stg_core_schema1__table1" - ], + "model.main.stg_core_schema1__table1", EXPECTED_STAGING_NODE, ), ( - DBT_MANIFEST_FILE_FIXTURE["nodes"][ - "model.main.stg_core_schema2__table2" - ], + "model.main.stg_core_schema2__table2", EXPECTED_STAGING_NODE_MULTIPLE_DEPENDENCIES, ), ( - DBT_MANIFEST_FILE_FIXTURE["nodes"][ - "seed.main.seed_buyer_country_overwrite" - ], + "seed.main.seed_buyer_country_overwrite", EXPECTED_SEED_NODE, ), ( - DBT_MANIFEST_FILE_FIXTURE["nodes"]["model.main.int_model3"], + "model.main.int_model3", EXPECTED_EPHEMERAL_NODE, ), ] @@ -81,6 +77,7 @@ def test_generate_io_inputs(self): "model3", EXPECTED_MODEL_MULTIPLE_DEPENDENCIES, ), + ("stg_core_schema2__table2", EXPECTED_DBT_STAGING_MODEL_DAGGER_INPUTS), ] for mock_input, expected_output in fixtures: result, _ = self._dbt_config_parser.generate_dagger_io(mock_input) @@ -88,8 +85,11 @@ def test_generate_io_inputs(self): self.assertListEqual(result, expected_output) def test_generate_io_outputs(self): - _, result = self._dbt_config_parser.generate_dagger_io( - self._sample_dbt_node.get("name") - ) + fixtures = [ + ("model1", EXPECTED_DAGGER_OUTPUTS), + ("stg_core_schema2__table2", EXPECTED_DBT_STAGING_MODEL_DAGGER_OUTPUTS), + ] + for mock_input, expected_output in fixtures: + _, result = self._dbt_config_parser.generate_dagger_io(mock_input) - self.assertListEqual(result, EXPECTED_DAGGER_OUTPUTS) + self.assertListEqual(result, expected_output) From d653977097c11342f015ab5c0b39e202da55250b Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Mon, 4 Dec 2023 18:47:01 +0100 Subject: [PATCH 053/134] bugfix --- dagger/dag_creator/airflow/dag_creator.py | 14 ++++++++------ 1 file changed, 8 
insertions(+), 6 deletions(-) diff --git a/dagger/dag_creator/airflow/dag_creator.py b/dagger/dag_creator/airflow/dag_creator.py index 5db416f..2b208b9 100644 --- a/dagger/dag_creator/airflow/dag_creator.py +++ b/dagger/dag_creator/airflow/dag_creator.py @@ -163,13 +163,15 @@ def _create_edge_without_data(self, from_task_id: str, to_task_ids: list, node: external_task_sensor = self._get_external_task_sensor( from_task_id, to_task_id, edge_properties.follow_external_dependency ) - self._sensor_dict[to_pipe] = { + + if self._sensor_dict.get(to_pipe) is None: + self._sensor_dict[to_pipe] = {} + + self._sensor_dict[to_pipe].update({ external_task_sensor_name: external_task_sensor - } - ( - self._tasks[self._get_control_flow_task_id(to_pipe)] - >> external_task_sensor - ) + }) + + self._tasks[self._get_control_flow_task_id(to_pipe)] >> external_task_sensor self._sensor_dict[to_pipe][external_task_sensor_name] >> self._tasks[to_task_id] else: self._tasks[self._get_control_flow_task_id(to_pipe)] >> self._tasks[to_task_id] From 48a102fcecfe3dda9c5e280e33ab081f4319f4f1 Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Mon, 4 Dec 2023 19:16:58 +0100 Subject: [PATCH 054/134] only return dummy when stg model --- dagger/utilities/dbt_config_parser.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/dagger/utilities/dbt_config_parser.py b/dagger/utilities/dbt_config_parser.py index 6c80cb4..dd16008 100644 --- a/dagger/utilities/dbt_config_parser.py +++ b/dagger/utilities/dbt_config_parser.py @@ -143,18 +143,14 @@ def _generate_dagger_tasks( if node.get("resource_type") == "seed": task = self._get_dummy_task(node) dagger_tasks.append(task) - elif node.get("resource_type") == 'source': + elif node.get("resource_type") == "source": athena_task = self._get_athena_task(node, follow_external_dependency=True) dagger_tasks.append(athena_task) elif node.get("config", {}).get("materialized") == "ephemeral": task = self._get_dummy_task(node, 
follow_external_dependency=True) dagger_tasks.append(task) elif node.get("name").startswith("stg_"): - source_node_names = node.get("depends_on", {}).get("nodes", []) dagger_tasks.append(self._get_dummy_task(node)) - for source_node_name in source_node_names: - task = self._generate_dagger_tasks(source_node_name) - dagger_tasks.extend(task) else: athena_task = self._get_athena_task(node, follow_external_dependency=True) s3_task = self._get_s3_task(node) From f6c7226606956e74e91cb19e58207f8897fe97a1 Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Mon, 4 Dec 2023 19:19:37 +0100 Subject: [PATCH 055/134] adapted tests --- .../modules/dbt_config_parser_fixtures.py | 68 ++++--------------- tests/utilities/test_dbt_config_parser.py | 12 ++-- 2 files changed, 20 insertions(+), 60 deletions(-) diff --git a/tests/fixtures/modules/dbt_config_parser_fixtures.py b/tests/fixtures/modules/dbt_config_parser_fixtures.py index 432b2a3..2f42778 100644 --- a/tests/fixtures/modules/dbt_config_parser_fixtures.py +++ b/tests/fixtures/modules/dbt_config_parser_fixtures.py @@ -165,34 +165,6 @@ EXPECTED_STAGING_NODE = [ {"name": "stg_core_schema1__table1", "type": "dummy"}, - { - "type": "athena", - "name": "core_schema1__table1_athena", - "schema": "core_schema1", - "table": "table1", - "follow_external_dependency": True, - }, -] -EXPECTED_STAGING_NODE_MULTIPLE_DEPENDENCIES = [ - {"name": "stg_core_schema2__table2", "type": "dummy"}, - { - "type": "athena", - "name": "core_schema2__table2_athena", - "schema": "core_schema2", - "table": "table2", - "follow_external_dependency": True, - }, - { - "type": "athena", - "name": "core_schema2__table3_athena", - "schema": "core_schema2", - "table": "table3", - "follow_external_dependency": True, - }, - { - "type": "dummy", - "name": "seed_buyer_country_overwrite", - }, ] EXPECTED_SEED_NODE = [ @@ -225,21 +197,7 @@ "type": "dummy", "name": "seed_buyer_country_overwrite", }, - {"name": "stg_core_schema2__table2", "type": "dummy"}, - { - "type": 
"athena", - "name": "core_schema2__table2_athena", - "schema": "core_schema2", - "table": "table2", - "follow_external_dependency": True, - }, - { - "type": "athena", - "name": "core_schema2__table3_athena", - "schema": "core_schema2", - "table": "table3", - "follow_external_dependency": True, - }, + {"name": "stg_core_schema2__table2", "type": "dummy"} ] EXPECTED_EPHEMERAL_NODE = [ @@ -250,23 +208,24 @@ } ] -EXPECTED_DAGGER_INPUTS = [ - {"name": "stg_core_schema2__table2", "type": "dummy"}, +EXPECTED_MODEL_NODE = [ { - "name": "core_schema2__table2_athena", - "schema": "core_schema2", - "table": "table2", "type": "athena", + "name": "analytics_engineering__model1_athena", + "schema": "analytics_engineering", + "table": "model1", "follow_external_dependency": True, }, { - "name": "core_schema2__table3_athena", - "schema": "core_schema2", - "table": "table3", - "type": "athena", - "follow_external_dependency": True, + "bucket": "bucket1-data-lake", + "name": "analytics_engineering__model1_s3", + "path": "path1/model1", + "type": "s3", }, - {"name": "seed_buyer_country_overwrite", "type": "dummy"}, +] + +EXPECTED_DAGGER_INPUTS = [ + {"name": "stg_core_schema2__table2", "type": "dummy"}, { "name": "analytics_engineering__model2_athena", "schema": "analytics_engineering", @@ -285,6 +244,7 @@ "name": "int_model3", "follow_external_dependency": True, }, + {"name": "seed_buyer_country_overwrite", "type": "dummy"}, ] EXPECTED_DBT_STAGING_MODEL_DAGGER_INPUTS = [ diff --git a/tests/utilities/test_dbt_config_parser.py b/tests/utilities/test_dbt_config_parser.py index be9b3dc..c03976d 100644 --- a/tests/utilities/test_dbt_config_parser.py +++ b/tests/utilities/test_dbt_config_parser.py @@ -11,12 +11,12 @@ DBT_MANIFEST_FILE_FIXTURE, DBT_PROFILE_FIXTURE, EXPECTED_STAGING_NODE, - EXPECTED_STAGING_NODE_MULTIPLE_DEPENDENCIES, EXPECTED_SEED_NODE, EXPECTED_MODEL_MULTIPLE_DEPENDENCIES, EXPECTED_EPHEMERAL_NODE, EXPECTED_DBT_STAGING_MODEL_DAGGER_OUTPUTS, 
EXPECTED_DBT_STAGING_MODEL_DAGGER_INPUTS, + EXPECTED_MODEL_NODE, ) _logger = logging.getLogger("root") @@ -47,16 +47,12 @@ def test_generate_task_configs(self): module.generate_task_configs() - def test_generate_dagger_inputs(self): + def test_generate_dagger_tasks(self): test_inputs = [ ( "model.main.stg_core_schema1__table1", EXPECTED_STAGING_NODE, ), - ( - "model.main.stg_core_schema2__table2", - EXPECTED_STAGING_NODE_MULTIPLE_DEPENDENCIES, - ), ( "seed.main.seed_buyer_country_overwrite", EXPECTED_SEED_NODE, @@ -65,6 +61,10 @@ def test_generate_dagger_inputs(self): "model.main.int_model3", EXPECTED_EPHEMERAL_NODE, ), + ( + "model.main.model1", + EXPECTED_MODEL_NODE, + ), ] for mock_input, expected_output in test_inputs: result = self._dbt_config_parser._generate_dagger_tasks(mock_input) From 146846e7953db1999265510cbc74dc3344f57de2 Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Wed, 6 Dec 2023 12:57:57 +0100 Subject: [PATCH 056/134] initialize dbt module only when its a dbt pipeline config --- dagger/utilities/module.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/dagger/utilities/module.py b/dagger/utilities/module.py index 7b954e3..5c81e04 100644 --- a/dagger/utilities/module.py +++ b/dagger/utilities/module.py @@ -22,7 +22,13 @@ def __init__(self, path_to_config, target_dir): self._branches_to_generate = config["branches_to_generate"] self._override_parameters = config.get("override_parameters", {}) self._default_parameters = config.get("default_parameters", {}) - self._dbt_module = DBTConfigParser(self._default_parameters) + + if ( + "dbt_profile" in self._default_parameters.keys() + and "project_dir" in self._default_parameters.keys() + and "profile_dir" in self._default_parameters.keys() + ): + self._dbt_module = DBTConfigParser(self._default_parameters) @staticmethod def read_yaml(yaml_str): From fd2bd0f365ad94b2dc837b0ac39a0419be7bcf1f Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Wed, 6 Dec 2023 12:58:16 +0100 
Subject: [PATCH 057/134] format --- dagger/utilities/module.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/dagger/utilities/module.py b/dagger/utilities/module.py index 5c81e04..ade94e3 100644 --- a/dagger/utilities/module.py +++ b/dagger/utilities/module.py @@ -17,7 +17,9 @@ def __init__(self, path_to_config, target_dir): self._tasks = {} for task in config["tasks"]: - self._tasks[task] = self.read_task_config(f"{path.join(self._directory, task)}.yaml") + self._tasks[task] = self.read_task_config( + f"{path.join(self._directory, task)}.yaml" + ) self._branches_to_generate = config["branches_to_generate"] self._override_parameters = config.get("override_parameters", {}) @@ -54,7 +56,7 @@ def replace_template_parameters(_task_str, _template_parameters): if type(_value) == str: try: int_value = int(_value) - _value = f"\"{_value}\"" + _value = f'"{_value}"' except: pass locals()[_key] = _value @@ -90,17 +92,19 @@ def generate_task_configs(self): ) task_dict = yaml.safe_load(task_str) - if task == 'dbt': + if task == "dbt": inputs, outputs = self._dbt_module.generate_dagger_io(branch_name) - task_dict['inputs'] = inputs - task_dict['outputs'] = outputs - task_dict['task_parameters']['select'] = branch_name + task_dict["inputs"] = inputs + task_dict["outputs"] = outputs + task_dict["task_parameters"]["select"] = branch_name task_dict["autogenerated_by_dagger"] = self._path_to_config override_parameters = self._override_parameters or {} merge(task_dict, override_parameters.get(branch_name, {}).get(task, {})) - self.dump_yaml(task_dict, f"{path.join(self._target_dir, task_name)}.yaml") + self.dump_yaml( + task_dict, f"{path.join(self._target_dir, task_name)}.yaml" + ) @staticmethod def module_config_template(): From fb0c2e9cadc6fe6a1e3c6a933101a088bc943ec6 Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Wed, 6 Dec 2023 16:14:59 +0100 Subject: [PATCH 058/134] made logic to check for dbt task easier --- 
dagger/utilities/module.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/dagger/utilities/module.py b/dagger/utilities/module.py index ade94e3..d565ffe 100644 --- a/dagger/utilities/module.py +++ b/dagger/utilities/module.py @@ -25,11 +25,7 @@ def __init__(self, path_to_config, target_dir): self._override_parameters = config.get("override_parameters", {}) self._default_parameters = config.get("default_parameters", {}) - if ( - "dbt_profile" in self._default_parameters.keys() - and "project_dir" in self._default_parameters.keys() - and "profile_dir" in self._default_parameters.keys() - ): + if 'dbt' in self._tasks.keys(): self._dbt_module = DBTConfigParser(self._default_parameters) @staticmethod From a2b3fdda30196e09c84392de83c1f304191df950 Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Thu, 7 Dec 2023 19:54:18 +0100 Subject: [PATCH 059/134] fix: follow external dependency for staging models --- dagger/utilities/dbt_config_parser.py | 9 +++++++-- .../modules/dbt_config_parser_fixtures.py | 18 +++++++++++++++--- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/dagger/utilities/dbt_config_parser.py b/dagger/utilities/dbt_config_parser.py index dd16008..15f31df 100644 --- a/dagger/utilities/dbt_config_parser.py +++ b/dagger/utilities/dbt_config_parser.py @@ -112,7 +112,10 @@ def _generate_dagger_output(self, node: dict): dict: The dagger output, which is a combination of an athena and s3 task for the DBT model node """ - if node.get("config", {}).get("materialized") in ("view", "ephemeral"): + if node.get("config", {}).get("materialized") in ( + "view", + "ephemeral", + ) or node.get("name").startswith("stg_"): return [self._get_dummy_task(node)] else: return [self._get_athena_task(node), self._get_s3_task(node)] @@ -150,7 +153,9 @@ def _generate_dagger_tasks( task = self._get_dummy_task(node, follow_external_dependency=True) dagger_tasks.append(task) elif node.get("name").startswith("stg_"): - 
dagger_tasks.append(self._get_dummy_task(node)) + dagger_tasks.append( + self._get_dummy_task(node, follow_external_dependency=True) + ) else: athena_task = self._get_athena_task(node, follow_external_dependency=True) s3_task = self._get_s3_task(node) diff --git a/tests/fixtures/modules/dbt_config_parser_fixtures.py b/tests/fixtures/modules/dbt_config_parser_fixtures.py index 2f42778..90ebf03 100644 --- a/tests/fixtures/modules/dbt_config_parser_fixtures.py +++ b/tests/fixtures/modules/dbt_config_parser_fixtures.py @@ -164,7 +164,11 @@ } EXPECTED_STAGING_NODE = [ - {"name": "stg_core_schema1__table1", "type": "dummy"}, + { + "name": "stg_core_schema1__table1", + "type": "dummy", + "follow_external_dependency": True, + }, ] EXPECTED_SEED_NODE = [ @@ -197,7 +201,11 @@ "type": "dummy", "name": "seed_buyer_country_overwrite", }, - {"name": "stg_core_schema2__table2", "type": "dummy"} + { + "name": "stg_core_schema2__table2", + "type": "dummy", + "follow_external_dependency": True, + }, ] EXPECTED_EPHEMERAL_NODE = [ @@ -225,7 +233,11 @@ ] EXPECTED_DAGGER_INPUTS = [ - {"name": "stg_core_schema2__table2", "type": "dummy"}, + { + "name": "stg_core_schema2__table2", + "type": "dummy", + "follow_external_dependency": True, + }, { "name": "analytics_engineering__model2_athena", "schema": "analytics_engineering", From 365364687a7baa0b6e6326be1c8a541a10dafbf6 Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Fri, 23 Feb 2024 11:36:26 +0530 Subject: [PATCH 060/134] added logic to get parents of int models this is done to keep track of dependencies of int models that are ephemeral --- dagger/utilities/dbt_config_parser.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/dagger/utilities/dbt_config_parser.py b/dagger/utilities/dbt_config_parser.py index 15f31df..f12cccf 100644 --- a/dagger/utilities/dbt_config_parser.py +++ b/dagger/utilities/dbt_config_parser.py @@ -152,6 +152,10 @@ def _generate_dagger_tasks( elif node.get("config", {}).get("materialized") == "ephemeral": 
task = self._get_dummy_task(node, follow_external_dependency=True) dagger_tasks.append(task) + + ephemeral_parent_node_names = node.get("depends_on", {}).get("nodes", []) + for node_name in ephemeral_parent_node_names: + dagger_tasks += self._generate_dagger_tasks(node_name) elif node.get("name").startswith("stg_"): dagger_tasks.append( self._get_dummy_task(node, follow_external_dependency=True) From 7dec65aa51fa310cf8d1dfe210464efd15e48610 Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Fri, 23 Feb 2024 11:37:26 +0530 Subject: [PATCH 061/134] updated tests and fixtures --- .../modules/dbt_config_parser_fixtures.py | 70 +++++++++++++++++-- tests/utilities/test_dbt_config_parser.py | 2 + 2 files changed, 68 insertions(+), 4 deletions(-) diff --git a/tests/fixtures/modules/dbt_config_parser_fixtures.py b/tests/fixtures/modules/dbt_config_parser_fixtures.py index 90ebf03..a28d871 100644 --- a/tests/fixtures/modules/dbt_config_parser_fixtures.py +++ b/tests/fixtures/modules/dbt_config_parser_fixtures.py @@ -78,6 +78,10 @@ "config": { "materialized": "ephemeral", }, + "depends_on": { + "macros": [], + "nodes": ["model.main.int_model2"], + }, }, "seed.main.seed_buyer_country_overwrite": { "database": "awsdatacatalog", @@ -108,6 +112,21 @@ ], }, }, + "model.main.int_model2": { + "name": "int_model2", + "unique_id": "model.main.int_model2", + "schema": "analytics_engineering", + "config": { + "materialized": "ephemeral", + }, + "depends_on": { + "macros": [], + "nodes": [ + "seed.main.seed_buyer_country_overwrite", + "model.main.stg_core_schema1__table1", + ], + }, + }, }, "sources": { "source.main.core_schema1.table1": { @@ -184,6 +203,20 @@ "name": "int_model3", "follow_external_dependency": True, }, + { + "type": "dummy", + "name": "int_model2", + "follow_external_dependency": True, + }, + { + "type": "dummy", + "name": "seed_buyer_country_overwrite", + }, + { + "name": "stg_core_schema1__table1", + "type": "dummy", + "follow_external_dependency": True, + }, { 
"type": "athena", "name": "analytics_engineering__model2_athena", @@ -197,10 +230,6 @@ "path": "path2/model2", "type": "s3", }, - { - "type": "dummy", - "name": "seed_buyer_country_overwrite", - }, { "name": "stg_core_schema2__table2", "type": "dummy", @@ -213,6 +242,20 @@ "type": "dummy", "name": "int_model3", "follow_external_dependency": True, + }, + { + "type": "dummy", + "name": "int_model2", + "follow_external_dependency": True, + }, + { + "type": "dummy", + "name": "seed_buyer_country_overwrite", + }, + { + "name": "stg_core_schema1__table1", + "type": "dummy", + "follow_external_dependency": True, } ] @@ -256,7 +299,17 @@ "name": "int_model3", "follow_external_dependency": True, }, + { + "type": "dummy", + "name": "int_model2", + "follow_external_dependency": True, + }, {"name": "seed_buyer_country_overwrite", "type": "dummy"}, + { + "name": "stg_core_schema1__table1", + "type": "dummy", + "follow_external_dependency": True, + }, ] EXPECTED_DBT_STAGING_MODEL_DAGGER_INPUTS = [ @@ -298,3 +351,12 @@ "name": "stg_core_schema2__table2", }, ] + +EXPECTED_DBT_INT_MODEL_DAGGER_INPUTS = [ + {"name": "seed_buyer_country_overwrite", "type": "dummy"}, + { + "name": "stg_core_schema1__table1", + "type": "dummy", + "follow_external_dependency": True, + }, +] diff --git a/tests/utilities/test_dbt_config_parser.py b/tests/utilities/test_dbt_config_parser.py index c03976d..3fd6394 100644 --- a/tests/utilities/test_dbt_config_parser.py +++ b/tests/utilities/test_dbt_config_parser.py @@ -17,6 +17,7 @@ EXPECTED_DBT_STAGING_MODEL_DAGGER_OUTPUTS, EXPECTED_DBT_STAGING_MODEL_DAGGER_INPUTS, EXPECTED_MODEL_NODE, + EXPECTED_DBT_INT_MODEL_DAGGER_INPUTS ) _logger = logging.getLogger("root") @@ -78,6 +79,7 @@ def test_generate_io_inputs(self): EXPECTED_MODEL_MULTIPLE_DEPENDENCIES, ), ("stg_core_schema2__table2", EXPECTED_DBT_STAGING_MODEL_DAGGER_INPUTS), + ("int_model2", EXPECTED_DBT_INT_MODEL_DAGGER_INPUTS), ] for mock_input, expected_output in fixtures: result, _ = 
self._dbt_config_parser.generate_dagger_io(mock_input) From bdc3e57babf7a74caa8bc3d99b3515f0c72a9b30 Mon Sep 17 00:00:00 2001 From: David Siklosi Date: Mon, 25 Mar 2024 20:33:40 +0100 Subject: [PATCH 062/134] Turing split_statements on by default --- dagger/dag_creator/airflow/operators/postgres_operator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dagger/dag_creator/airflow/operators/postgres_operator.py b/dagger/dag_creator/airflow/operators/postgres_operator.py index b833516..c01b255 100644 --- a/dagger/dag_creator/airflow/operators/postgres_operator.py +++ b/dagger/dag_creator/airflow/operators/postgres_operator.py @@ -51,6 +51,6 @@ def execute(self, context): self.hook = PostgresHook( postgres_conn_id=self.postgres_conn_id, schema=self.database ) - self.hook.run(self.sql, self.autocommit, parameters=self.parameters) + self.hook.run(self.sql, self.autocommit, parameters=self.parameters, split_statements=True) for output in self.hook.conn.notices: self.log.info(output) From 96ad27b584ba27acbb9dbe54f418426fd47764f7 Mon Sep 17 00:00:00 2001 From: David Siklosi Date: Sat, 13 Apr 2024 20:59:23 +0200 Subject: [PATCH 063/134] Moving to python3.9; Upgrading airflow version; removing legacy postgres operator --- Makefile | 6 ++-- .../redshift_load_creator.py | 5 ++- .../redshift_transform_creator.py | 6 ++-- .../redshift_unload_creator.py | 7 +++-- .../airflow/operators/postgres_operator.py | 2 +- reqs/dev.txt | 31 +++++++++---------- reqs/test.txt | 2 +- setup.py | 3 +- 8 files changed, 30 insertions(+), 32 deletions(-) diff --git a/Makefile b/Makefile index 02daa9e..8872dba 100644 --- a/Makefile +++ b/Makefile @@ -96,15 +96,15 @@ install: clean ## install the package to the active Python's site-packages install-dev: clean ## install the package to the active Python's site-packages - virtualenv -p python3 venv; \ + virtualenv -p python3.9 venv; \ source venv/bin/activate; \ python -m pip install --upgrade pip; \ python setup.py install; \ pip 
install -e . ; \ - pip install -r reqs/dev.txt -r reqs/test.txt + SYSTEM_VERSION_COMPAT=0 CFLAGS='-std=c++20' pip install -r reqs/dev.txt -r reqs/test.txt install-test: clean ## install the package to the active Python's site-packages - virtualenv -p python3 venv; \ + virtualenv -p python3.9 venv; \ source venv/bin/activate; \ python -m pip install --upgrade pip; \ pip install -r reqs/test.txt -r reqs/base.txt diff --git a/dagger/dag_creator/airflow/operator_creators/redshift_load_creator.py b/dagger/dag_creator/airflow/operator_creators/redshift_load_creator.py index 8d14182..f1576f4 100644 --- a/dagger/dag_creator/airflow/operator_creators/redshift_load_creator.py +++ b/dagger/dag_creator/airflow/operator_creators/redshift_load_creator.py @@ -2,9 +2,8 @@ from typing import Optional from dagger.dag_creator.airflow.operator_creator import OperatorCreator -from dagger.dag_creator.airflow.operators.redshift_sql_operator import ( - RedshiftSQLOperator, -) +from dagger.dag_creator.airflow.operators.redshift_sql_operator import RedshiftSQLOperator + class RedshiftLoadCreator(OperatorCreator): diff --git a/dagger/dag_creator/airflow/operator_creators/redshift_transform_creator.py b/dagger/dag_creator/airflow/operator_creators/redshift_transform_creator.py index 0218a6f..c8eb8dd 100644 --- a/dagger/dag_creator/airflow/operator_creators/redshift_transform_creator.py +++ b/dagger/dag_creator/airflow/operator_creators/redshift_transform_creator.py @@ -1,7 +1,7 @@ from os.path import join from dagger.dag_creator.airflow.operator_creator import OperatorCreator -from dagger.dag_creator.airflow.operators.postgres_operator import PostgresOperator +from dagger.dag_creator.airflow.operators.redshift_sql_operator import RedshiftSQLOperator class RedshiftTransformCreator(OperatorCreator): @@ -22,11 +22,11 @@ def _read_sql(directory, file_path): def _create_operator(self, **kwargs): sql_string = self._read_sql(self._task.pipeline.directory, self._task.sql_file) - redshift_op = 
PostgresOperator( + redshift_op = RedshiftSQLOperator( dag=self._dag, task_id=self._task.name, sql=sql_string, - postgres_conn_id=self._task.postgres_conn_id, + redshift_conn_id=self._task.postgres_conn_id, params=self._template_parameters, **kwargs, ) diff --git a/dagger/dag_creator/airflow/operator_creators/redshift_unload_creator.py b/dagger/dag_creator/airflow/operator_creators/redshift_unload_creator.py index 7fd74d7..cb7be04 100644 --- a/dagger/dag_creator/airflow/operator_creators/redshift_unload_creator.py +++ b/dagger/dag_creator/airflow/operator_creators/redshift_unload_creator.py @@ -1,7 +1,7 @@ from os.path import join from dagger.dag_creator.airflow.operator_creator import OperatorCreator -from dagger.dag_creator.airflow.operators.postgres_operator import PostgresOperator +from dagger.dag_creator.airflow.operators.redshift_sql_operator import RedshiftSQLOperator REDSHIFT_UNLOAD_CMD = """ unload ('{sql_string}') @@ -58,12 +58,13 @@ def _create_operator(self, **kwargs): unload_cmd = self._get_unload_command(sql_string) - redshift_op = PostgresOperator( + redshift_op = RedshiftSQLOperator( dag=self._dag, task_id=self._task.name, sql=unload_cmd, - postgres_conn_id=self._task.postgres_conn_id, + redshift_conn_id=self._task.postgres_conn_id, params=self._template_parameters, + autocommit=True, **kwargs, ) diff --git a/dagger/dag_creator/airflow/operators/postgres_operator.py b/dagger/dag_creator/airflow/operators/postgres_operator.py index c01b255..ce90250 100644 --- a/dagger/dag_creator/airflow/operators/postgres_operator.py +++ b/dagger/dag_creator/airflow/operators/postgres_operator.py @@ -1,6 +1,6 @@ from typing import Iterable, Mapping, Optional, Union -from airflow.hooks.postgres_hook import PostgresHook +from airflow.providers.postgres.hooks.postgres import PostgresHook from airflow.utils.decorators import apply_defaults from dagger.dag_creator.airflow.operators.dagger_base_operator import DaggerBaseOperator diff --git a/reqs/dev.txt b/reqs/dev.txt 
index 806d6c5..c52136a 100644 --- a/reqs/dev.txt +++ b/reqs/dev.txt @@ -1,19 +1,18 @@ -apache-airflow[amazon,postgres,s3,statsd]==2.3.4 +pip==24.0 +apache-airflow[amazon,postgres,s3,statsd]==2.9.0 --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-2.9.0/constraints-3.9.txt" black==22.10.0 -boto3==1.26.16 +boto3==1.34.82 bumpversion==0.6.0 -coverage==6.5.0 -elasticsearch==7.17.7 -flake8==5.0.4 -neo4j==5.2.1 -numpydoc==1.5.0 -pip==22.3.1 +coverage==7.4.4 +#elasticsearch==7.17.7 +flake8==7.0.0 +#neo4j==5.19.0 +numpydoc==1.7.0 pre-commit==2.20.0 -sphinx-rtd-theme==1.1.1 -Sphinx==4.3.2 -SQLAlchemy==1.4.44 -tox==3.27.1 -twine==4.0.1 -watchdog==2.1.9 -Werkzeug==2.2.2 -wheel==0.38.4 +sphinx-rtd-theme==2.0.0 +Sphinx==7.2.6 +SQLAlchemy +tox==4.14.2 +twine==5.0.0 +watchdog==4.0.0 +Werkzeug diff --git a/reqs/test.txt b/reqs/test.txt index c568f77..195932d 100644 --- a/reqs/test.txt +++ b/reqs/test.txt @@ -1,3 +1,3 @@ -apache-airflow[amazon,postgres,s3,statsd]==2.3.4 +apache-airflow[amazon,postgres,s3,statsd]==2.9.0 pytest-cov==4.0.0 pytest==7.2.0 diff --git a/setup.py b/setup.py index 3f80fe3..080a5bb 100644 --- a/setup.py +++ b/setup.py @@ -45,8 +45,7 @@ def reqs(*f): classifiers=[ "Development Status :: 2 - Pre-Alpha", "Intended Audience :: Developers", - "Programming Language :: Python :: 3.6", - "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.9", ], description="Config Driven ETL", entry_points={"console_scripts": ["dagger=dagger.main:cli"]}, From 3e62d78068b396779e89b2ddedd32c87eff3cf57 Mon Sep 17 00:00:00 2001 From: David Siklosi Date: Mon, 15 Apr 2024 11:00:22 +0200 Subject: [PATCH 064/134] Making sensor default args more flexible --- dagger/conf.py | 4 +--- dagger/dag_creator/airflow/dag_creator.py | 8 ++------ dagger/dagger_config.yaml | 8 +++++--- 3 files changed, 8 insertions(+), 12 deletions(-) diff --git a/dagger/conf.py b/dagger/conf.py index 667c207..6b5488f 100644 --- a/dagger/conf.py +++ b/dagger/conf.py 
@@ -21,9 +21,7 @@ # Airflow parameters airflow_config = config.get('airflow', None) or {} WITH_DATA_NODES = airflow_config.get('with_data_nodes', False) -EXTERNAL_SENSOR_POKE_INTERVAL = airflow_config.get('external_sensor_poke_interval', 600) -EXTERNAL_SENSOR_TIMEOUT = airflow_config.get('external_sensor_timeout', 28800) -EXTERNAL_SENSOR_MODE = airflow_config.get('external_sensor_mode', 'reschedule') +EXTERNAL_SENSOR_DEFAULT_ARGS = airflow_config.get('external_sensor_default_args', {}) IS_DUMMY_OPERATOR_SHORT_CIRCUIT = airflow_config.get('is_dummy_operator_short_circuit', False) # Neo4j parameters diff --git a/dagger/dag_creator/airflow/dag_creator.py b/dagger/dag_creator/airflow/dag_creator.py index 2b208b9..031a3a4 100644 --- a/dagger/dag_creator/airflow/dag_creator.py +++ b/dagger/dag_creator/airflow/dag_creator.py @@ -72,12 +72,7 @@ def _get_external_task_sensor(self, from_task_id: str, to_task_id: str, follow_e to_pipe_id = self._task_graph.get_node(to_task_id).obj.pipeline.name - - extra_args = { - 'mode': conf.EXTERNAL_SENSOR_MODE, - 'poke_interval': conf.EXTERNAL_SENSOR_POKE_INTERVAL, - 'timeout': conf.EXTERNAL_SENSOR_TIMEOUT, - } + extra_args = conf.EXTERNAL_SENSOR_DEFAULT_ARGS.copy() extra_args.update(follow_external_dependency) return ExternalTaskSensor( @@ -141,6 +136,7 @@ def _create_edge_without_data(self, from_task_id: str, to_task_ids: list, node: to_task_ids: The IDs of the tasks to which the edge connects. node: The current node in a task graph. 
""" + from_pipe = ( self._task_graph.get_node(from_task_id).obj.pipeline_name if from_task_id else None ) diff --git a/dagger/dagger_config.yaml b/dagger/dagger_config.yaml index 9eac6ff..3366828 100644 --- a/dagger/dagger_config.yaml +++ b/dagger/dagger_config.yaml @@ -1,8 +1,10 @@ airflow: + external_sensor_default_args: + poll_interval: 30 + timeout: 28800 + mode: reschedule + deferrable: true with_data_node: false - external_sensor_poke_interval: 600 - external_sensor_timeout: 28800 - external_sensor_mode: reschedule is_dummy_operator_short_circuit: false From 41f15544787651da6b9a2b3e085a766656f93226 Mon Sep 17 00:00:00 2001 From: David Siklosi Date: Mon, 15 Apr 2024 14:15:44 +0200 Subject: [PATCH 065/134] Upgrading python in CI --- .github/workflows/ci-data.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci-data.yml b/.github/workflows/ci-data.yml index 599d325..bef5bed 100644 --- a/.github/workflows/ci-data.yml +++ b/.github/workflows/ci-data.yml @@ -17,10 +17,10 @@ jobs: with: persist-credentials: false - - name: Set up Python 3.7 + - name: Set up Python 3.9 uses: actions/setup-python@v2 with: - python-version: 3.7 + python-version: 3.9 - name: Install dependencies run: | From d8145ab6b0ccda834b58f107ef3fd7ed5a1a83a0 Mon Sep 17 00:00:00 2001 From: David Siklosi Date: Mon, 15 Apr 2024 14:25:25 +0200 Subject: [PATCH 066/134] Adding graphviz dependency to test --- reqs/dev.txt | 4 ++-- reqs/test.txt | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/reqs/dev.txt b/reqs/dev.txt index c52136a..238b2e2 100644 --- a/reqs/dev.txt +++ b/reqs/dev.txt @@ -4,9 +4,9 @@ black==22.10.0 boto3==1.34.82 bumpversion==0.6.0 coverage==7.4.4 -#elasticsearch==7.17.7 +elasticsearch==7.17.7 flake8==7.0.0 -#neo4j==5.19.0 +neo4j==5.19.0 numpydoc==1.7.0 pre-commit==2.20.0 sphinx-rtd-theme==2.0.0 diff --git a/reqs/test.txt b/reqs/test.txt index 195932d..7bdc89f 100644 --- a/reqs/test.txt +++ b/reqs/test.txt @@ -1,3 +1,4 @@ 
apache-airflow[amazon,postgres,s3,statsd]==2.9.0 pytest-cov==4.0.0 pytest==7.2.0 +graphviz From 8cc2ec2640220ec2f5e605a71dc050a68cc3df9c Mon Sep 17 00:00:00 2001 From: David Siklosi Date: Tue, 16 Apr 2024 12:59:04 +0200 Subject: [PATCH 067/134] Upgrading some package versions to remove warnings --- reqs/base.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/reqs/base.txt b/reqs/base.txt index 877d102..d9cc38a 100644 --- a/reqs/base.txt +++ b/reqs/base.txt @@ -1,7 +1,7 @@ click==8.1.3 -croniter==1.3.8 +croniter==2.0.2 envyaml==1.10.211231 mergedeep==1.3.4 slack==0.0.2 slackclient==2.9.4 -tenacity==8.2.0 +tenacity==8.2.3 From f56e6b62890bb41fabb9cae394da7b777d43c16b Mon Sep 17 00:00:00 2001 From: claudiazi Date: Wed, 17 Apr 2024 10:31:57 +0200 Subject: [PATCH 068/134] feat: rename profile_name to target_name --- .../dag_creator/airflow/operator_creators/dbt_creator.py | 4 ++-- dagger/pipeline/tasks/dbt_task.py | 8 ++++---- dockers/airflow/airflow.cfg | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/dagger/dag_creator/airflow/operator_creators/dbt_creator.py b/dagger/dag_creator/airflow/operator_creators/dbt_creator.py index 1c16835..38b9c34 100644 --- a/dagger/dag_creator/airflow/operator_creators/dbt_creator.py +++ b/dagger/dag_creator/airflow/operator_creators/dbt_creator.py @@ -12,7 +12,7 @@ def __init__(self, task, dag): self._project_dir = task.project_dir self._profile_dir = task.profile_dir - self._profile_name = task.profile_name + self._target_name = task.target_name self._select = task.select self._dbt_command = task.dbt_command @@ -20,7 +20,7 @@ def _generate_command(self): command = [self._task.executable_prefix, self._task.executable] command.append(f"--project_dir={self._project_dir}") command.append(f"--profiles_dir={self._profile_dir}") - command.append(f"--profile_name={self._profile_name}") + command.append(f"--target_name={self._target_name}") command.append(f"--dbt_command={self._dbt_command}") if 
self._select: command.append(f"--select={self._select}") diff --git a/dagger/pipeline/tasks/dbt_task.py b/dagger/pipeline/tasks/dbt_task.py index 33b9c1a..c59cdd6 100644 --- a/dagger/pipeline/tasks/dbt_task.py +++ b/dagger/pipeline/tasks/dbt_task.py @@ -20,7 +20,7 @@ def init_attributes(cls, orig_cls): comment="Which directory to look in for the profiles.yml file", ), Attribute( - attribute_name="profile_name", + attribute_name="target_name", required=False, parent_fields=["task_parameters"], comment="Which target to load for the given profile " @@ -45,7 +45,7 @@ def __init__(self, name, pipeline_name, pipeline, job_config): self._project_dir = self.parse_attribute("project_dir") self._profile_dir = self.parse_attribute("profile_dir") - self._profile_name = self.parse_attribute("profile_name") or "default" + self._target_name = self.parse_attribute("target_name") or "default" self._select = self.parse_attribute("select") self._dbt_command = self.parse_attribute("dbt_command") @@ -58,8 +58,8 @@ def profile_dir(self): return self._profile_dir @property - def profile_name(self): - return self._profile_name + def target_name(self): + return self._target_name @property def select(self): diff --git a/dockers/airflow/airflow.cfg b/dockers/airflow/airflow.cfg index a5ace87..0b19fbd 100644 --- a/dockers/airflow/airflow.cfg +++ b/dockers/airflow/airflow.cfg @@ -434,7 +434,7 @@ backend = # The backend_kwargs param is loaded into a dictionary and passed to __init__ of secrets backend class. # See documentation for the secrets backend you are using. JSON is expected. 
# Example for AWS Systems Manager ParameterStore: -# ``{{"connections_prefix": "/airflow/connections", "profile_name": "default"}}`` +# ``{{"connections_prefix": "/airflow/connections", "target_name": "default"}}`` backend_kwargs = [cli] From 1b99357a25766888d334b9a2858130070c1b7f1c Mon Sep 17 00:00:00 2001 From: claudiazi Date: Fri, 26 Apr 2024 10:38:35 +0200 Subject: [PATCH 069/134] feat: register new databricks_io --- dagger/pipeline/io_factory.py | 3 +- dagger/pipeline/ios/databricks_io.py | 48 +++++++++++++++++++ .../fixtures/pipeline/ios/databricks_io.yaml | 11 +++++ tests/pipeline/ios/test_databricks_io.py | 17 +++++++ 4 files changed, 78 insertions(+), 1 deletion(-) create mode 100644 dagger/pipeline/ios/databricks_io.py create mode 100644 tests/fixtures/pipeline/ios/databricks_io.yaml create mode 100644 tests/pipeline/ios/test_databricks_io.py diff --git a/dagger/pipeline/io_factory.py b/dagger/pipeline/io_factory.py index 782fd14..5454f31 100644 --- a/dagger/pipeline/io_factory.py +++ b/dagger/pipeline/io_factory.py @@ -6,7 +6,8 @@ dummy_io, gdrive_io, redshift_io, - s3_io + s3_io, + databricks_io ) from dagger.utilities.classes import get_deep_obj_subclasses diff --git a/dagger/pipeline/ios/databricks_io.py b/dagger/pipeline/ios/databricks_io.py new file mode 100644 index 0000000..dd9041b --- /dev/null +++ b/dagger/pipeline/ios/databricks_io.py @@ -0,0 +1,48 @@ +from dagger.pipeline.io import IO +from dagger.utilities.config_validator import Attribute + + +class DatabricksIO(IO): + ref_name = "databricks" + + @classmethod + def init_attributes(cls, orig_cls): + cls.add_config_attributes( + [ + Attribute(attribute_name="catalog"), + Attribute( + attribute_name="schema" + ), + Attribute(attribute_name="table"), + ] + ) + + def __init__(self, io_config, config_location): + super().__init__(io_config, config_location) + + self._catalog = self.parse_attribute("catalog") + self._schema = self.parse_attribute("schema") + self._table = 
self.parse_attribute("table") + + def alias(self): + return f"databricks://{self._catalog}/{self._schema}/{self._table}" + + @property + def rendered_name(self): + return f"{self._catalog}.{self._schema}.{self._table}" + + @property + def airflow_name(self): + return f"databricks-{self._catalog}-{self._schema}-{self._table}" + + @property + def catalog(self): + return self._catalog + + @property + def schema(self): + return self._schema + + @property + def table(self): + return self._table diff --git a/tests/fixtures/pipeline/ios/databricks_io.yaml b/tests/fixtures/pipeline/ios/databricks_io.yaml new file mode 100644 index 0000000..a8d5914 --- /dev/null +++ b/tests/fixtures/pipeline/ios/databricks_io.yaml @@ -0,0 +1,11 @@ +type: databricks +name: test +catalog: test_catalog +schema: test_schema +table: test_table + + + +# Other attributes: + +# has_dependency: # Weather this i/o should be added to the dependency graph or not. Default is True \ No newline at end of file diff --git a/tests/pipeline/ios/test_databricks_io.py b/tests/pipeline/ios/test_databricks_io.py new file mode 100644 index 0000000..b1d0c45 --- /dev/null +++ b/tests/pipeline/ios/test_databricks_io.py @@ -0,0 +1,17 @@ +import unittest +from dagger.pipeline.io_factory import databricks_io + +import yaml + + +class DbIOTest(unittest.TestCase): + def setUp(self) -> None: + with open('tests/fixtures/pipeline/ios/databricks_io.yaml', "r") as stream: + config = yaml.safe_load(stream) + + self.db_io = databricks_io.DatabricksIO(config, "/") + + def test_properties(self): + self.assertEqual(self.db_io.alias(), "databricks://test_catalog/test_schema/test_table") + self.assertEqual(self.db_io.rendered_name, "test_catalog.test_schema.test_table") + self.assertEqual(self.db_io.airflow_name, "databricks-test_catalog-test_schema-test_table") From e00f555d138f5d1bdfe661a62037d66583a98b00 Mon Sep 17 00:00:00 2001 From: claudiazi Date: Fri, 26 Apr 2024 10:48:06 +0200 Subject: [PATCH 070/134] feat: refactor the 
DBTParseConfig to parse databricks-dbt manifest --- .../airflow/operator_creators/dbt_creator.py | 19 +- dagger/pipeline/tasks/dbt_task.py | 35 +- dagger/utilities/dbt_config_parser.py | 372 ++++++++++++------ dagger/utilities/module.py | 7 +- 4 files changed, 299 insertions(+), 134 deletions(-) diff --git a/dagger/dag_creator/airflow/operator_creators/dbt_creator.py b/dagger/dag_creator/airflow/operator_creators/dbt_creator.py index 38b9c34..4b88fe3 100644 --- a/dagger/dag_creator/airflow/operator_creators/dbt_creator.py +++ b/dagger/dag_creator/airflow/operator_creators/dbt_creator.py @@ -12,29 +12,26 @@ def __init__(self, task, dag): self._project_dir = task.project_dir self._profile_dir = task.profile_dir + self._profile_name = task.profile_name self._target_name = task.target_name self._select = task.select self._dbt_command = task.dbt_command + self._vars = task.vars + # self._create_external_athena_table = task.create_external_athena_table def _generate_command(self): command = [self._task.executable_prefix, self._task.executable] command.append(f"--project_dir={self._project_dir}") command.append(f"--profiles_dir={self._profile_dir}") + command.append(f"--profile_name={self._profile_name}") command.append(f"--target_name={self._target_name}") command.append(f"--dbt_command={self._dbt_command}") if self._select: command.append(f"--select={self._select}") - - if len(self._template_parameters) > 0: - dbt_vars = json.dumps(self._template_parameters) + if self._vars: + dbt_vars = json.dumps(self._vars) command.append(f"--vars='{dbt_vars}'") + # if self._create_external_athena_table: + # command.append(f"--create_external_athena_table={self._create_external_athena_table}") return command - - # Overwriting function because for dbt we don't want to add inputs/outputs to the - # template parameters. 
- def create_operator(self): - self._template_parameters.update(self._task.template_parameters) - self._update_airflow_parameters() - - return self._create_operator(**self._airflow_parameters) diff --git a/dagger/pipeline/tasks/dbt_task.py b/dagger/pipeline/tasks/dbt_task.py index c59cdd6..aea0945 100644 --- a/dagger/pipeline/tasks/dbt_task.py +++ b/dagger/pipeline/tasks/dbt_task.py @@ -19,9 +19,13 @@ def init_attributes(cls, orig_cls): parent_fields=["task_parameters"], comment="Which directory to look in for the profiles.yml file", ), + Attribute( + attribute_name="profile_name", + parent_fields=["task_parameters"], + comment="Which profile to load from the profiles.yml file", + ), Attribute( attribute_name="target_name", - required=False, parent_fields=["task_parameters"], comment="Which target to load for the given profile " "(--target dbt option). Default is 'default'", @@ -37,6 +41,18 @@ def init_attributes(cls, orig_cls): parent_fields=["task_parameters"], comment="Specify the name of the DBT command to run", ), + Attribute( + attribute_name="vars", + required=False, + parent_fields=["task_parameters"], + comment="Specify the variables to pass to dbt", + ), + Attribute( + attribute_name="create_external_athena_table", + required=False, + parent_fields=["task_parameters"], + comment="Specify whether to create an external Athena table for the model", + ) ] ) @@ -45,9 +61,12 @@ def __init__(self, name, pipeline_name, pipeline, job_config): self._project_dir = self.parse_attribute("project_dir") self._profile_dir = self.parse_attribute("profile_dir") - self._target_name = self.parse_attribute("target_name") or "default" + self._profile_name = self.parse_attribute("profile_name") + self._target_name = self.parse_attribute("target_name") self._select = self.parse_attribute("select") self._dbt_command = self.parse_attribute("dbt_command") + self._vars = self.parse_attribute("vars") + self._create_external_athena_table = 
self.parse_attribute("create_external_athena_table") @property def project_dir(self): @@ -57,6 +76,10 @@ def project_dir(self): def profile_dir(self): return self._profile_dir + @property + def profile_name(self): + return self._profile_name + @property def target_name(self): return self._target_name @@ -68,3 +91,11 @@ def select(self): @property def dbt_command(self): return self._dbt_command + + @property + def vars(self): + return self._vars + + @property + def create_external_athena_table(self): + return self._create_external_athena_table \ No newline at end of file diff --git a/dagger/utilities/dbt_config_parser.py b/dagger/utilities/dbt_config_parser.py index f12cccf..6c2ae5d 100644 --- a/dagger/utilities/dbt_config_parser.py +++ b/dagger/utilities/dbt_config_parser.py @@ -1,42 +1,89 @@ import json +import yaml +from abc import ABC, abstractmethod from collections import OrderedDict from os import path from os.path import join from typing import Tuple, List, Dict +import logging -import yaml - +# Task base configurations ATHENA_TASK_BASE = {"type": "athena"} +DATABRICKS_TASK_BASE = {"type": "databricks"} S3_TASK_BASE = {"type": "s3"} +_logger = logging.getLogger("root") -class DBTConfigParser: - """ - Module that parses the manifest.json file generated by dbt and generates the dagger inputs and outputs for the respective dbt model - """ +class DBTConfigParser(ABC): + """Abstract base class for parsing dbt manifest.json files and generating task configurations.""" - def __init__(self, default_config_parameters: dict): - self._dbt_profile = default_config_parameters.get("dbt_profile", "data") - self._default_data_bucket = default_config_parameters["data_bucket"] - self._dbt_project_dir = default_config_parameters.get("project_dir", None) - dbt_manifest_path = path.join(self._dbt_project_dir, "target", "manifest.json") - self._dbt_profile_dir = default_config_parameters.get("profile_dir", None) - dbt_profile_path = path.join(self._dbt_profile_dir, 
"profiles.yml") - - with open(dbt_manifest_path, "r") as f: - data = f.read() - self._manifest_data = json.loads(data) - profile_yaml = yaml.safe_load(open(dbt_profile_path, "r")) - prod_dbt_profile = profile_yaml[self._dbt_project_dir.split("/")[-1]][ - "outputs" - ][self._dbt_profile] - self._default_data_dir = prod_dbt_profile.get( - "s3_data_dir" - ) or prod_dbt_profile.get("s3_staging_dir") - self._default_schema = prod_dbt_profile.get("schema") + def __init__(self, config_parameters: dict): + self._dbt_project_dir = config_parameters.get("project_dir") + self._profile_name = config_parameters.get("profile_name", "") + self._target_name = config_parameters.get("target_name", "") + self._dbt_profile_dir = config_parameters.get("profile_dir", None) + self._manifest_data = self._load_file( + self._get_manifest_path(), file_type="json" + ) + profile_data = self._load_file(self._get_profile_path(), file_type="yaml") + self._target_config = profile_data[self._profile_name]["outputs"][ + self._target_name + ] + self._default_schema = self._target_config.get("schema", "") + self._nodes_in_manifest = self._manifest_data.get("nodes", {}) + self._sources_in_manifest = self._manifest_data.get("sources", {}) + + def _get_manifest_path(self) -> str: + """ + Construct path for manifest.json file based on configuration parameters. + """ + target_path = f"{self._profile_name}_target" + return path.join(self._dbt_project_dir, target_path, "manifest.json") - self._nodes_in_manifest = self._manifest_data["nodes"] - self._sources_in_manifest = self._manifest_data["sources"] + def _get_profile_path(self) -> str: + """ + Construct path for profiles.yml file based on configuration parameters. 
+ """ + return path.join(self._dbt_profile_dir, "profiles.yml") + + @staticmethod + def _load_file(file_path: str, file_type: str) -> dict: + """Load a file (JSON or YAML) based on the specified type and return its contents.""" + try: + with open(file_path, "r") as file: + if file_type == "json": + return json.load(file) + elif file_type == "yaml": + return yaml.safe_load(file) + except FileNotFoundError: + _logger.error(f"File not found: {file_path}") + exit(1) + + @abstractmethod + def _get_athena_table_task( + self, node: dict, follow_external_dependency: bool = False + ) -> dict: + """Generate an athena table task for a DBT node. Must be implemented by subclasses. This function should be deprecated after the source connects with databricks directly""" + pass + + @abstractmethod + def _get_table_task( + self, node: dict, follow_external_dependency: bool = False + ) -> dict: + """Generate a table task for a DBT node for the specific dbt-adapter. Must be implemented by subclasses.""" + pass + + @abstractmethod + def _get_model_data_location( + self, node: dict, schema: str, model_name: str + ) -> Tuple[str, str]: + """Get the S3 path of the DBT model relative to the data bucket. Must be implemented by subclasses.""" + pass + + @abstractmethod + def _get_s3_task(self, node: dict) -> dict: + """Generate an S3 task configuration based on a DBT node. Must be implemented by subclasses.""" + pass @staticmethod def _get_dummy_task(node: dict, follow_external_dependency: bool = False) -> dict: @@ -58,18 +105,107 @@ def _get_dummy_task(node: dict, follow_external_dependency: bool = False) -> dic return task - def _get_athena_task( - self, node: dict, follow_external_dependency: bool = False - ) -> dict: + @abstractmethod + def _generate_dagger_output(self, node: dict): + """Generate the dagger output for a DBT node. 
Must be implemented by subclasses.""" + pass + + def _generate_dagger_tasks(self, node_name: str) -> List[Dict]: """ - Generates the dagger athena task for the DBT model node + Generates the dagger task based on whether the DBT model node is a staging model or not. + If the DBT model node represents a DBT seed or an ephemeral model, then a dagger dummy task is generated. + If the DBT model node represents a staging model, then a dagger athena task is generated for each source of the DBT model. Apart from this, a dummy task is also generated for the staging model itself. + If the DBT model node is not a staging model, then a dagger athena task and an s3 task is generated for the DBT model node itself. Args: - node: The extracted node from the manifest.json file - follow_external_dependency: Whether to follow external airflow dependencies or not + node_name: The name of the DBT model node Returns: - dict: The dagger athena task for the DBT model node + List[Dict]: The respective dagger tasks for the DBT model node + """ + dagger_tasks = [] + + if node_name.startswith("source"): + node = self._sources_in_manifest[node_name] + else: + node = self._nodes_in_manifest[node_name] + + if node.get("resource_type") == "seed": + task = self._get_dummy_task(node) + dagger_tasks.append(task) + elif node.get("resource_type") == "source": + table_task = self._get_athena_table_task(node, follow_external_dependency=True) + dagger_tasks.append(table_task) + elif node.get("config", {}).get("materialized") == "ephemeral": + task = self._get_dummy_task(node, follow_external_dependency=True) + dagger_tasks.append(task) + + ephemeral_parent_node_names = node.get("depends_on", {}).get("nodes", []) + for node_name in ephemeral_parent_node_names: + dagger_tasks += self._generate_dagger_tasks(node_name) + elif node.get("name").startswith("stg_") or "preparation" in node.get( + "schema", "" + ): + dagger_tasks.append( + self._get_dummy_task(node, follow_external_dependency=True) + ) + else: + 
table_task = self._get_table_task(node, follow_external_dependency=True) + s3_task = self._get_s3_task(node) + + dagger_tasks.append(table_task) + dagger_tasks.append(s3_task) + + return dagger_tasks + + def generate_dagger_io(self, model_name: str) -> Tuple[List[dict], List[dict]]: + """ + Parse through all the parents of the DBT model and return the dagger inputs and outputs for the DBT model + Args: + model_name: The name of the DBT model + + Returns: + Tuple[list, list]: The dagger inputs and outputs for the DBT model + + """ + inputs_list = [] + model_node = self._nodes_in_manifest[f"model.main.{model_name}"] + parent_node_names = model_node.get("depends_on", {}).get("nodes", []) + print(f"parent node name: {parent_node_names}") + + for parent_node_name in parent_node_names: + dagger_input = self._generate_dagger_tasks(parent_node_name) + inputs_list += dagger_input + + output_list = self._generate_dagger_output(model_node) + + unique_inputs = list( + OrderedDict( + (frozenset(item.items()), item) for item in inputs_list + ).values() + ) + + print(unique_inputs) + + return unique_inputs, output_list + + +class AthenaDBTConfigParser(DBTConfigParser): + """Implementation for Athena configurations.""" + def __init__(self, default_config_parameters: dict): + super().__init__(default_config_parameters) + self._profile_name = "athena" + self._default_data_bucket = default_config_parameters.get("data_bucket") + self._default_data_dir = self._target_config.get( + "s3_data_dir" + ) or self._target_config.get("s3_staging_dir") + + + def _get_table_task( + self, node: dict, follow_external_dependency: bool = False + ) -> dict: + """ + Generates the dagger athena task for the DBT model node """ task = ATHENA_TASK_BASE.copy() if follow_external_dependency: @@ -81,6 +217,24 @@ def _get_athena_task( return task + def _get_athena_table_task(self, node: dict, follow_external_dependency: bool = False) -> dict: + return self._get_table_task(node, follow_external_dependency) + + 
def _get_model_data_location( + self, node: dict, schema: str, model_name: str + ) -> Tuple[str, str]: + """ + Gets the S3 path of the dbt model relative to the data bucket. + """ + location = node.get("config", {}).get("external_location") + if not location: + location = join(self._default_data_dir, schema, model_name) + + split = location.split("//")[1].split("/") + bucket_name, data_path = split[0], "/".join(split[1:]) + + return bucket_name, data_path + def _get_s3_task(self, node: dict) -> dict: """ Generates the dagger s3 task for the DBT model node @@ -93,17 +247,16 @@ def _get_s3_task(self, node: dict) -> dict: """ task = S3_TASK_BASE.copy() + schema = node.get("schema", self._default_schema) table = node.get("name", "") task["name"] = f"{schema}__{table}_s3" - task["bucket"] = self._default_data_bucket - task["path"] = self._get_model_data_location(node, schema, table)[1] - + task["bucket"], task["path"] = self._get_model_data_location(node, schema, table) return task def _generate_dagger_output(self, node: dict): """ - Generates the dagger output for the DBT model node. If the model is materialized as a view or ephemeral, then a dummy task is created. + Generates the dagger output for the DBT model node with athena-dbt adapter. If the model is materialized as a view or ephemeral, then a dummy task is created. Otherwise, an athena and s3 task is created for the DBT model node. Args: node: The extracted node from the manifest.json file @@ -118,109 +271,90 @@ def _generate_dagger_output(self, node: dict): ) or node.get("name").startswith("stg_"): return [self._get_dummy_task(node)] else: - return [self._get_athena_task(node), self._get_s3_task(node)] - - def _generate_dagger_tasks( - self, - node_name: str, - ) -> List[Dict]: - """ - Generates the dagger task based on whether the DBT model node is a staging model or not. - If the DBT model node represents a DBT seed or an ephemeral model, then a dagger dummy task is generated. 
- If the DBT model node represents a staging model, then a dagger athena task is generated for each source of the DBT model. Apart from this, a dummy task is also generated for the staging model itself. - If the DBT model node is not a staging model, then a dagger athena task and an s3 task is generated for the DBT model node itself. - Args: - node: The extracted node from the manifest.json file + return [self._get_table_task(node), self._get_s3_task(node)] - Returns: - List[Dict]: The respective dagger tasks for the DBT model node - """ - dagger_tasks = [] +class DatabricksDBTConfigParser(DBTConfigParser): + """Implementation for Databricks configurations.""" - if node_name.startswith("source"): - node = self._sources_in_manifest[node_name] - else: - node = self._nodes_in_manifest[node_name] + def __init__(self, default_config_parameters: dict): + super().__init__(default_config_parameters) + self._profile_name = "databricks" + self._default_catalog = self._target_config.get("catalog") + self._athena_dbt_parser = AthenaDBTConfigParser(default_config_parameters) + self._create_external_athena_table = default_config_parameters.get("create_external_athena_table", False) - if node.get("resource_type") == "seed": - task = self._get_dummy_task(node) - dagger_tasks.append(task) - elif node.get("resource_type") == "source": - athena_task = self._get_athena_task(node, follow_external_dependency=True) - dagger_tasks.append(athena_task) - elif node.get("config", {}).get("materialized") == "ephemeral": - task = self._get_dummy_task(node, follow_external_dependency=True) - dagger_tasks.append(task) + def _get_table_task( + self, node: dict, follow_external_dependency: bool = False + ) -> dict: + """ + Generates the dagger databricks task for the DBT model node + """ + task = DATABRICKS_TASK_BASE.copy() + if follow_external_dependency: + task["follow_external_dependency"] = True - ephemeral_parent_node_names = node.get("depends_on", {}).get("nodes", []) - for node_name in 
ephemeral_parent_node_names: - dagger_tasks += self._generate_dagger_tasks(node_name) - elif node.get("name").startswith("stg_"): - dagger_tasks.append( - self._get_dummy_task(node, follow_external_dependency=True) - ) - else: - athena_task = self._get_athena_task(node, follow_external_dependency=True) - s3_task = self._get_s3_task(node) + task["catalog"] = node.get("database", self._default_catalog) + task["schema"] = node.get("schema", self._default_schema) + task["table"] = node.get("name", "") + task[ + "name" + ] = f"{task['catalog']}__{task['schema']}__{task['table']}_databricks" - dagger_tasks.append(athena_task) - dagger_tasks.append(s3_task) + return task - return dagger_tasks + def _get_athena_table_task(self, node: dict, follow_external_dependency: bool = False) -> dict: + return self._athena_dbt_parser._get_table_task(node, follow_external_dependency) def _get_model_data_location( - self, node: dict, schema: str, dbt_model_name: str + self, node: dict, schema: str, model_name: str ) -> Tuple[str, str]: """ Gets the S3 path of the dbt model relative to the data bucket. - If external location is not specified in the DBT model config, then the default data directory from the - DBT profiles configuration is used. 
- Args: - node: The extracted node from the manifest.json file - schema: The schema of the dbt model - dbt_model_name: The name of the dbt model - - Returns: - str: The relative S3 path of the dbt model relative to the data bucket - """ - location = node.get("config", {}).get("external_location") - if not location: - location = join(self._default_data_dir, schema, dbt_model_name) - + location_root = node.get("config", {}).get("location_root") + location = join(location_root, schema, model_name) split = location.split("//")[1].split("/") bucket_name, data_path = split[0], "/".join(split[1:]) return bucket_name, data_path - def generate_dagger_io(self, model_name: str) -> Tuple[List[dict], List[dict]]: + def _get_s3_task(self, node: dict) -> dict: """ - Parse through all the parents of the DBT model and return the dagger inputs and outputs for the DBT model - Args: - model_name: The name of the DBT model - - Returns: - Tuple[list, list]: The dagger inputs and outputs for the DBT model - + Generates the dagger s3 task for the databricks-dbt model node """ - inputs_list = [] - - model_node = self._nodes_in_manifest[f"model.main.{model_name}"] - - parent_node_names = model_node.get("depends_on", {}).get("nodes", []) + task = S3_TASK_BASE.copy() - for parent_node_name in parent_node_names: - dagger_input = self._generate_dagger_tasks(parent_node_name) + catalog = node.get("database", self._default_catalog) + schema = node.get("schema", self._default_schema) + table = node.get("name", "") + task["name"] = f"{catalog}__{schema}__{table}_s3" + task["bucket"], task["path"] = self._get_model_data_location( + node, schema, table + ) - inputs_list += dagger_input + return task - output_list = self._generate_dagger_output(model_node) + def _generate_dagger_output(self, node: dict): + """ + Generates the dagger output for the DBT model node with the databricks-dbt adapter. + If the model is materialized as a view or ephemeral, then a dummy task is created. 
+ Otherwise, a databricks and s3 task is created for the DBT model node. + And if create_external_athena_table is True, an extra athena task is created. + Args: + node: The extracted node from the manifest.json file - unique_inputs = list( - OrderedDict( - (frozenset(item.items()), item) for item in inputs_list - ).values() - ) + Returns: + dict: The dagger output, which is a combination of a databricks and s3 task for the DBT model node - return unique_inputs, output_list + """ + if node.get("config", {}).get("materialized") in ( + "view", + "ephemeral", + ) or node.get("name").startswith("stg_"): + return [self._get_dummy_task(node)] + else: + output_tasks = [self._get_table_task(node), self._get_s3_task(node)] + if self._create_external_athena_table: + output_tasks.append(self._get_athena_table_task(node)) + return output_tasks diff --git a/dagger/utilities/module.py b/dagger/utilities/module.py index d565ffe..3cb261d 100644 --- a/dagger/utilities/module.py +++ b/dagger/utilities/module.py @@ -1,7 +1,7 @@ import logging from os import path from mergedeep import merge -from dagger.utilities.dbt_config_parser import DBTConfigParser +from dagger.utilities.dbt_config_parser import AthenaDBTConfigParser, DatabricksDBTConfigParser import yaml @@ -26,7 +26,10 @@ def __init__(self, path_to_config, target_dir): self._default_parameters = config.get("default_parameters", {}) if 'dbt' in self._tasks.keys(): - self._dbt_module = DBTConfigParser(self._default_parameters) + if self._default_parameters.get('profile_name') == 'athena': + self._dbt_module = AthenaDBTConfigParser(self._default_parameters) + if self._default_parameters.get('profile_name') == 'databricks': + self._dbt_module = DatabricksDBTConfigParser(self._default_parameters) @staticmethod def read_yaml(yaml_str): From 9274fe222618c7dabb620aac3b468d41d9f5179d Mon Sep 17 00:00:00 2001 From: claudiazi Date: Fri, 26 Apr 2024 10:48:31 +0200 Subject: [PATCH 071/134] feat: add unit test for databricks config parser
--- ...y => dbt_config_parser_fixtures_athena.py} | 40 +- .../dbt_config_parser_fixtures_databricks.py | 385 ++++++++++++++++++ tests/utilities/test_dbt_config_parser.py | 98 ++++- 3 files changed, 485 insertions(+), 38 deletions(-) rename tests/fixtures/modules/{dbt_config_parser_fixtures.py => dbt_config_parser_fixtures_athena.py} (99%) create mode 100644 tests/fixtures/modules/dbt_config_parser_fixtures_databricks.py diff --git a/tests/fixtures/modules/dbt_config_parser_fixtures.py b/tests/fixtures/modules/dbt_config_parser_fixtures_athena.py similarity index 99% rename from tests/fixtures/modules/dbt_config_parser_fixtures.py rename to tests/fixtures/modules/dbt_config_parser_fixtures_athena.py index a28d871..66005e6 100644 --- a/tests/fixtures/modules/dbt_config_parser_fixtures.py +++ b/tests/fixtures/modules/dbt_config_parser_fixtures_athena.py @@ -1,3 +1,23 @@ +DBT_PROFILE_FIXTURE = { + "athena": { + "outputs": { + "data": { + "aws_profile_name": "data", + "database": "awsdatacatalog", + "num_retries": 10, + "region_name": "eu-west-1", + "s3_data_dir": "s3://bucket1-data-lake/path1/tmp", + "s3_data_naming": "schema_table", + "s3_staging_dir": "s3://bucket1-data-lake/path1/", + "schema": "analytics_engineering", + "threads": 4, + "type": "athena", + "work_group": "primary", + }, + } + } +} + DBT_MANIFEST_FILE_FIXTURE = { "nodes": { "model.main.model1": { @@ -162,26 +182,6 @@ }, } -DBT_PROFILE_FIXTURE = { - "main": { - "outputs": { - "data": { - "aws_profile_name": "data", - "database": "awsdatacatalog", - "num_retries": 10, - "region_name": "eu-west-1", - "s3_data_dir": "s3://bucket1-data-lake/path1/tmp", - "s3_data_naming": "schema_table", - "s3_staging_dir": "s3://bucket1-data-lake/path1/", - "schema": "analytics_engineering", - "threads": 4, - "type": "athena", - "work_group": "primary", - }, - } - } -} - EXPECTED_STAGING_NODE = [ { "name": "stg_core_schema1__table1", diff --git a/tests/fixtures/modules/dbt_config_parser_fixtures_databricks.py 
b/tests/fixtures/modules/dbt_config_parser_fixtures_databricks.py new file mode 100644 index 0000000..5c6e0a4 --- /dev/null +++ b/tests/fixtures/modules/dbt_config_parser_fixtures_databricks.py @@ -0,0 +1,385 @@ +DATABRICKS_DBT_PROFILE_FIXTURE = { + "databricks": { + "outputs": { + "data": { + "type": "databricks", + "catalog": "hive_metastore", + "schema": "analytics_engineering", + "host": "xxx.databricks.com", + "http_path": "/sql/1.0/warehouses/xxx", + "token": "{{ env_var('SECRETDATABRICKS') }}" + }, + } + + } +} + +DATABRICKS_DBT_MANIFEST_FILE_FIXTURE = { + "nodes": { + "model.main.model1": { + "database": "marts", + "schema": "analytics_engineering", + "name": "model1", + "unique_id": "model.main.model1", + "resource_type": "model", + "config": { + "location_root": "s3://chodata-data-lake/analytics_warehouse/data/marts", + "materialized": "incremental", + "incremental_strategy": "insert_overwrite", + }, + "description": "Details of revenue calculation at supplier level for each observation day", + "tags": ["daily"], + "unrendered_config": { + "materialized": "incremental", + "location_root": "s3://chodata-data-lake/analytics_warehouse/data/marts", + "incremental_strategy": "insert_overwrite", + "partitioned_by": ["year", "month", "day", "dt"], + "tags": ["daily"], + "on_schema_change": "fail", + }, + "depends_on": { + "macros": [ + "macro.main.macro1", + "macro.main.macro2", + ], + "nodes": [ + "model.main.stg_core_schema2__table2", + "model.main.model2", + "model.main.int_model3", + "seed.main.seed_buyer_country_overwrite", + ], + }, + }, + "model.main.stg_core_schema1__table1": { + "database": "hive_metastore", + "schema": "data_preparation", + "name": "stg_core_schema1__table1", + "unique_id": "model.main.stg_core_schema1__table1", + "resource_type": "model", + "config": { + "location_root": "s3://chodata-data-lake/analytics_warehouse/data/preparation", + "materialized": "view", + }, + "depends_on": { + "macros": [], + "nodes": 
["source.main.core_schema1.table1"], + }, + }, + "model.main.stg_core_schema2__table2": { + "database": "hive_metastore", + "schema": "data_preparation", + "name": "stg_core_schema2__table2", + "unique_id": "model.main.stg_core_schema2__table2", + "resource_type": "model", + "config": { + "location_root": "s3://chodata-data-lake/analytics_warehouse/data/preparation", + "materialized": "view", + }, + "depends_on": { + "macros": [], + "nodes": [ + "source.main.core_schema2.table2", + "source.main.core_schema2.table3", + "seed.main.seed_buyer_country_overwrite", + ], + }, + }, + "model.main.model2": { + "database": "marts", + "schema": "analytics_engineering", + "name": "model2", + "unique_id": "model.main.model2", + "resource_type": "model", + "config": { + "location_root": "s3://chodata-data-lake/analytics_warehouse/data/marts", + "materialized": "table", + }, + "depends_on": {"macros": [], "nodes": []}, + }, + "model.main.int_model3": { + "name": "int_model3", + "unique_id": "model.main.int_model3", + "database": "intermediate", + "schema": "analytics_engineering", + "resource_type": "model", + "config": { + "materialized": "ephemeral", + "location_root": "s3://chodata-data-lake/analytics_warehouse/data/intermediate", + }, + "depends_on": { + "macros": [], + "nodes": ["model.main.int_model2"], + }, + }, + "seed.main.seed_buyer_country_overwrite": { + "database": "hive_metastore", + "schema": "datastg_preparation", + "name": "seed_buyer_country_overwrite", + "unique_id": "seed.main.seed_buyer_country_overwrite", + "resource_type": "seed", + "alias": "seed_buyer_country_overwrite", + "tags": ["analytics"], + "description": "", + "created_at": 1700216177.105391, + "depends_on": {"macros": []}, + }, + "model.main.model3": { + "name": "model3", + "database": "marts", + "schema": "analytics_engineering", + "unique_id": "model.main.model3", + "config": { + "location_root": "s3://chodata-data-lake/analytics_warehouse/data/marts", + }, + "depends_on": { + "macros": [], + 
"nodes": [ + "model.main.int_model3", + "model.main.model2", + "seed.main.seed_buyer_country_overwrite", + "model.main.stg_core_schema2__table2", + ], + }, + }, + "model.main.int_model2": { + "name": "int_model2", + "unique_id": "model.main.int_model2", + "database": "intermediate", + "schema": "analytics_engineering", + "config": { + "materialized": "ephemeral", + "location_root": "s3://chodata-data-lake/analytics_warehouse/data/intermediate", + }, + "depends_on": { + "macros": [], + "nodes": [ + "seed.main.seed_buyer_country_overwrite", + "model.main.stg_core_schema1__table1", + ], + }, + }, + }, + "sources": { + "source.main.core_schema1.table1": { + "source_name": "table1", + "database": "hive_metastore", + "schema": "core_schema1", + "resource_type": "source", + "unique_id": "source.main.core_schema1.table1", + "name": "table1", + "tags": ["analytics"], + "description": "", + }, + "source.main.core_schema2.table2": { + "source_name": "table2", + "database": "hive_metastore", + "schema": "core_schema2", + "resource_type": "source", + "unique_id": "source.main.core_schema2.table2", + "name": "table2", + "tags": ["analytics"], + "description": "", + }, + "source.main.core_schema2.table3": { + "source_name": "table3", + "database": "hive_metastore", + "schema": "core_schema2", + "resource_type": "source", + "unique_id": "source.main.core_schema2.table3", + "name": "table3", + "tags": ["analytics"], + "description": "", + }, + }, +} + +DATABRICKS_EXPECTED_STAGING_NODE = [ + { + "name": "stg_core_schema1__table1", + "type": "dummy", + "follow_external_dependency": True, + }, +] + +DATABRICKS_EXPECTED_SEED_NODE = [ + { + "type": "dummy", + "name": "seed_buyer_country_overwrite", + } +] + +DATABRICKS_EXPECTED_MODEL_MULTIPLE_DEPENDENCIES = [ + { + "type": "dummy", + "name": "int_model3", + "follow_external_dependency": True, + }, + { + "type": "dummy", + "name": "int_model2", + "follow_external_dependency": True, + }, + { + "type": "dummy", + "name": 
"seed_buyer_country_overwrite", + }, + { + "name": "stg_core_schema1__table1", + "type": "dummy", + "follow_external_dependency": True, + }, + { + "type": "databricks", + "name": "marts__analytics_engineering__model2_databricks", + "catalog": "marts", + "schema": "analytics_engineering", + "table": "model2", + "follow_external_dependency": True, + }, + { + "bucket": "chodata-data-lake", + "name": "marts__analytics_engineering__model2_s3", + "path": "analytics_warehouse/data/marts/analytics_engineering/model2", + "type": "s3", + }, + { + "name": "stg_core_schema2__table2", + "type": "dummy", + "follow_external_dependency": True, + }, +] + +DATABRICKS_EXPECTED_EPHEMERAL_NODE = [ + { + "type": "dummy", + "name": "int_model3", + "follow_external_dependency": True, + }, + { + "type": "dummy", + "name": "int_model2", + "follow_external_dependency": True, + }, + { + "type": "dummy", + "name": "seed_buyer_country_overwrite", + }, + { + "name": "stg_core_schema1__table1", + "type": "dummy", + "follow_external_dependency": True, + } +] + +DATABRICKS_EXPECTED_MODEL_NODE = [ + { + "type": "databricks", + "name": "marts__analytics_engineering__model1_databricks", + "catalog": "marts", + "schema": "analytics_engineering", + "table": "model1", + "follow_external_dependency": True, + }, + { + "bucket": "chodata-data-lake", + "name": "marts__analytics_engineering__model1_s3", + "path": "analytics_warehouse/data/marts/analytics_engineering/model1", + "type": "s3", + }, +] + +DATABRICKS_EXPECTED_DAGGER_INPUTS = [ + { + "name": "stg_core_schema2__table2", + "type": "dummy", + "follow_external_dependency": True, + }, + { + "name": "marts__analytics_engineering__model2_databricks", + "catalog": "marts", + "schema": "analytics_engineering", + "table": "model2", + "type": "databricks", + "follow_external_dependency": True, + }, + { + "bucket": "chodata-data-lake", + "name": "marts__analytics_engineering__model2_s3", + "path": "analytics_warehouse/data/marts/analytics_engineering/model2", 
+ "type": "s3", + }, + { + "type": "dummy", + "name": "int_model3", + "follow_external_dependency": True, + }, + { + "type": "dummy", + "name": "int_model2", + "follow_external_dependency": True, + }, + {"name": "seed_buyer_country_overwrite", "type": "dummy"}, + { + "name": "stg_core_schema1__table1", + "type": "dummy", + "follow_external_dependency": True, + }, +] + +DATABRICKS_EXPECTED_DBT_STAGING_MODEL_DAGGER_INPUTS = [ + { + "follow_external_dependency": True, + "name": "core_schema2__table2_athena", + "schema": "core_schema2", + "table": "table2", + "type": "athena", + }, + { + "follow_external_dependency": True, + "name": "core_schema2__table3_athena", + "schema": "core_schema2", + "table": "table3", + "type": "athena", + }, + {"name": "seed_buyer_country_overwrite", "type": "dummy"}, +] + +DATABRICKS_EXPECTED_DBT_INT_MODEL_DAGGER_INPUTS = [ + {"name": "seed_buyer_country_overwrite", "type": "dummy"}, + { + "name": "stg_core_schema1__table1", + "type": "dummy", + "follow_external_dependency": True, + }, +] + +DATABRICKS_EXPECTED_DAGGER_OUTPUTS = [ + { + "name": "marts__analytics_engineering__model1_databricks", + "catalog": "marts", + "schema": "analytics_engineering", + "table": "model1", + "type": "databricks", + }, + { + "bucket": "chodata-data-lake", + "name": "marts__analytics_engineering__model1_s3", + "path": "analytics_warehouse/data/marts/analytics_engineering/model1", + "type": "s3", + }, + { + "name": "analytics_engineering__model1_athena", + "schema": "analytics_engineering", + "table": "model1", + "type": "athena", + } +] + +DATABRICKS_EXPECTED_DBT_STAGING_MODEL_DAGGER_OUTPUTS = [ + { + "type": "dummy", + "name": "stg_core_schema2__table2", + }, +] + + diff --git a/tests/utilities/test_dbt_config_parser.py b/tests/utilities/test_dbt_config_parser.py index 3fd6394..8c188d3 100644 --- a/tests/utilities/test_dbt_config_parser.py +++ b/tests/utilities/test_dbt_config_parser.py @@ -3,22 +3,10 @@ from unittest import skip from unittest.mock import 
patch, MagicMock -from dagger.utilities.dbt_config_parser import DBTConfigParser +from dagger.utilities.dbt_config_parser import AthenaDBTConfigParser, DatabricksDBTConfigParser from dagger.utilities.module import Module -from tests.fixtures.modules.dbt_config_parser_fixtures import ( - EXPECTED_DAGGER_OUTPUTS, - EXPECTED_DAGGER_INPUTS, - DBT_MANIFEST_FILE_FIXTURE, - DBT_PROFILE_FIXTURE, - EXPECTED_STAGING_NODE, - EXPECTED_SEED_NODE, - EXPECTED_MODEL_MULTIPLE_DEPENDENCIES, - EXPECTED_EPHEMERAL_NODE, - EXPECTED_DBT_STAGING_MODEL_DAGGER_OUTPUTS, - EXPECTED_DBT_STAGING_MODEL_DAGGER_INPUTS, - EXPECTED_MODEL_NODE, - EXPECTED_DBT_INT_MODEL_DAGGER_INPUTS -) +from tests.fixtures.modules.dbt_config_parser_fixtures_athena import * +from tests.fixtures.modules.dbt_config_parser_fixtures_databricks import * _logger = logging.getLogger("root") @@ -26,17 +14,25 @@ "data_bucket": "bucket1-data-lake", "project_dir": "main", "profile_dir": ".dbt", - "dbt_profile": "data", + "profile_name": "athena", + "target_name": "data", +} +DATABRICKS_DEFAULT_CONFIG_PARAMS = { + "project_dir": "main", + "profile_dir": ".dbt", + "profile_name": "databricks", + "target_name": "data", + "create_external_athena_table": True, } MODEL_NAME = "model1" -class TestDBTConfigParser(unittest.TestCase): +class TestAthenaDBTConfigParser(unittest.TestCase): @patch("builtins.open", new_callable=MagicMock, read_data=DBT_MANIFEST_FILE_FIXTURE) @patch("json.loads", return_value=DBT_MANIFEST_FILE_FIXTURE) @patch("yaml.safe_load", return_value=DBT_PROFILE_FIXTURE) def setUp(self, mock_open, mock_json_load, mock_safe_load): - self._dbt_config_parser = DBTConfigParser(DEFAULT_CONFIG_PARAMS) + self._dbt_config_parser = AthenaDBTConfigParser(DEFAULT_CONFIG_PARAMS) self._sample_dbt_node = DBT_MANIFEST_FILE_FIXTURE["nodes"]["model.main.model1"] @skip("Run only locally") @@ -95,3 +91,69 @@ def test_generate_io_outputs(self): _, result = self._dbt_config_parser.generate_dagger_io(mock_input) self.assertListEqual(result, 
expected_output) + + +class TestDatabricksDBTConfigParser(unittest.TestCase): + @patch("builtins.open", new_callable=MagicMock, read_data=DATABRICKS_DBT_MANIFEST_FILE_FIXTURE) + @patch("json.loads", return_value=DATABRICKS_DBT_MANIFEST_FILE_FIXTURE) + @patch("yaml.safe_load", return_value=DATABRICKS_DBT_PROFILE_FIXTURE) + def setUp(self, mock_open, mock_json_load, mock_safe_load): + self._dbt_config_parser = DatabricksDBTConfigParser(DATABRICKS_DEFAULT_CONFIG_PARAMS) + self._sample_dbt_node = DATABRICKS_DBT_MANIFEST_FILE_FIXTURE["nodes"]["model.main.model1"] + + @skip("Run only locally") + def test_generate_task_configs(self): + module = Module( + path_to_config="./tests/fixtures/modules/dbt_test_config.yaml", + target_dir="./tests/fixtures/modules/", + ) + + module.generate_task_configs() + + def test_generate_dagger_tasks(self): + test_inputs = [ + ( + "model.main.stg_core_schema1__table1", + DATABRICKS_EXPECTED_STAGING_NODE, + ), + ( + "seed.main.seed_buyer_country_overwrite", + DATABRICKS_EXPECTED_SEED_NODE, + ), + ( + "model.main.int_model3", + DATABRICKS_EXPECTED_EPHEMERAL_NODE, + ), + ( + "model.main.model1", + DATABRICKS_EXPECTED_MODEL_NODE, + ), + ] + for mock_input, expected_output in test_inputs: + result = self._dbt_config_parser._generate_dagger_tasks(mock_input) + self.assertListEqual(result, expected_output) + + def test_generate_io_inputs(self): + fixtures = [ + ("model1", DATABRICKS_EXPECTED_DAGGER_INPUTS), + ( + "model3", + DATABRICKS_EXPECTED_MODEL_MULTIPLE_DEPENDENCIES, + ), + ("stg_core_schema2__table2", DATABRICKS_EXPECTED_DBT_STAGING_MODEL_DAGGER_INPUTS), + ("int_model2", DATABRICKS_EXPECTED_DBT_INT_MODEL_DAGGER_INPUTS), + ] + for mock_input, expected_output in fixtures: + result, _ = self._dbt_config_parser.generate_dagger_io(mock_input) + + self.assertListEqual(result, expected_output) + + def test_generate_io_outputs(self): + fixtures = [ + ("model1", DATABRICKS_EXPECTED_DAGGER_OUTPUTS), + ("stg_core_schema2__table2", 
DATABRICKS_EXPECTED_DBT_STAGING_MODEL_DAGGER_OUTPUTS), + ] + for mock_input, expected_output in fixtures: + _, result = self._dbt_config_parser.generate_dagger_io(mock_input) + + self.assertListEqual(result, expected_output) From e951c474b82975ba6a40bb3039ff300719a9441d Mon Sep 17 00:00:00 2001 From: claudiazi Date: Fri, 26 Apr 2024 10:53:11 +0200 Subject: [PATCH 072/134] chore: black --- dagger/pipeline/ios/databricks_io.py | 4 +-- dagger/pipeline/tasks/dbt_task.py | 10 ++++--- dagger/utilities/dbt_config_parser.py | 23 ++++++++++----- dagger/utilities/module.py | 11 ++++--- .../dbt_config_parser_fixtures_athena.py | 2 +- .../dbt_config_parser_fixtures_databricks.py | 9 ++---- tests/utilities/test_dbt_config_parser.py | 29 +++++++++++++++---- 7 files changed, 57 insertions(+), 31 deletions(-) diff --git a/dagger/pipeline/ios/databricks_io.py b/dagger/pipeline/ios/databricks_io.py index dd9041b..15be2c1 100644 --- a/dagger/pipeline/ios/databricks_io.py +++ b/dagger/pipeline/ios/databricks_io.py @@ -10,9 +10,7 @@ def init_attributes(cls, orig_cls): cls.add_config_attributes( [ Attribute(attribute_name="catalog"), - Attribute( - attribute_name="schema" - ), + Attribute(attribute_name="schema"), Attribute(attribute_name="table"), ] ) diff --git a/dagger/pipeline/tasks/dbt_task.py b/dagger/pipeline/tasks/dbt_task.py index aea0945..e59ea5a 100644 --- a/dagger/pipeline/tasks/dbt_task.py +++ b/dagger/pipeline/tasks/dbt_task.py @@ -28,7 +28,7 @@ def init_attributes(cls, orig_cls): attribute_name="target_name", parent_fields=["task_parameters"], comment="Which target to load for the given profile " - "(--target dbt option). Default is 'default'", + "(--target dbt option). 
Default is 'default'", ), Attribute( attribute_name="select", @@ -52,7 +52,7 @@ def init_attributes(cls, orig_cls): required=False, parent_fields=["task_parameters"], comment="Specify whether to create an external Athena table for the model", - ) + ), ] ) @@ -66,7 +66,9 @@ def __init__(self, name, pipeline_name, pipeline, job_config): self._select = self.parse_attribute("select") self._dbt_command = self.parse_attribute("dbt_command") self._vars = self.parse_attribute("vars") - self._create_external_athena_table = self.parse_attribute("create_external_athena_table") + self._create_external_athena_table = self.parse_attribute( + "create_external_athena_table" + ) @property def project_dir(self): @@ -98,4 +100,4 @@ def vars(self): @property def create_external_athena_table(self): - return self._create_external_athena_table \ No newline at end of file + return self._create_external_athena_table diff --git a/dagger/utilities/dbt_config_parser.py b/dagger/utilities/dbt_config_parser.py index 6c2ae5d..11c4325 100644 --- a/dagger/utilities/dbt_config_parser.py +++ b/dagger/utilities/dbt_config_parser.py @@ -134,7 +134,9 @@ def _generate_dagger_tasks(self, node_name: str) -> List[Dict]: task = self._get_dummy_task(node) dagger_tasks.append(task) elif node.get("resource_type") == "source": - table_task = self._get_athena_table_task(node, follow_external_dependency=True) + table_task = self._get_athena_table_task( + node, follow_external_dependency=True + ) dagger_tasks.append(table_task) elif node.get("config", {}).get("materialized") == "ephemeral": task = self._get_dummy_task(node, follow_external_dependency=True) @@ -192,6 +194,7 @@ def generate_dagger_io(self, model_name: str) -> Tuple[List[dict], List[dict]]: class AthenaDBTConfigParser(DBTConfigParser): """Implementation for Athena configurations.""" + def __init__(self, default_config_parameters: dict): super().__init__(default_config_parameters) self._profile_name = "athena" @@ -200,7 +203,6 @@ def __init__(self, 
default_config_parameters: dict): "s3_data_dir" ) or self._target_config.get("s3_staging_dir") - def _get_table_task( self, node: dict, follow_external_dependency: bool = False ) -> dict: @@ -217,7 +219,9 @@ def _get_table_task( return task - def _get_athena_table_task(self, node: dict, follow_external_dependency: bool = False) -> dict: + def _get_athena_table_task( + self, node: dict, follow_external_dependency: bool = False + ) -> dict: return self._get_table_task(node, follow_external_dependency) def _get_model_data_location( @@ -247,11 +251,12 @@ def _get_s3_task(self, node: dict) -> dict: """ task = S3_TASK_BASE.copy() - schema = node.get("schema", self._default_schema) table = node.get("name", "") task["name"] = f"{schema}__{table}_s3" - task["bucket"], task["path"] = self._get_model_data_location(node, schema, table) + task["bucket"], task["path"] = self._get_model_data_location( + node, schema, table + ) return task def _generate_dagger_output(self, node: dict): @@ -282,7 +287,9 @@ def __init__(self, default_config_parameters: dict): self._profile_name = "databricks" self._default_catalog = self._target_config.get("catalog") self._athena_dbt_parser = AthenaDBTConfigParser(default_config_parameters) - self._create_external_athena_table = default_config_parameters.get("create_external_athena_table", False) + self._create_external_athena_table = default_config_parameters.get( + "create_external_athena_table", False + ) def _get_table_task( self, node: dict, follow_external_dependency: bool = False @@ -303,7 +310,9 @@ def _get_table_task( return task - def _get_athena_table_task(self, node: dict, follow_external_dependency: bool = False) -> dict: + def _get_athena_table_task( + self, node: dict, follow_external_dependency: bool = False + ) -> dict: return self._athena_dbt_parser._get_table_task(node, follow_external_dependency) def _get_model_data_location( diff --git a/dagger/utilities/module.py b/dagger/utilities/module.py index 3cb261d..6b6aa86 100644 --- 
a/dagger/utilities/module.py +++ b/dagger/utilities/module.py @@ -1,7 +1,10 @@ import logging from os import path from mergedeep import merge -from dagger.utilities.dbt_config_parser import AthenaDBTConfigParser, DatabricksDBTConfigParser +from dagger.utilities.dbt_config_parser import ( + AthenaDBTConfigParser, + DatabricksDBTConfigParser, +) import yaml @@ -25,10 +28,10 @@ def __init__(self, path_to_config, target_dir): self._override_parameters = config.get("override_parameters", {}) self._default_parameters = config.get("default_parameters", {}) - if 'dbt' in self._tasks.keys(): - if self._default_parameters.get('profile_name') == 'athena': + if "dbt" in self._tasks.keys(): + if self._default_parameters.get("profile_name") == "athena": self._dbt_module = AthenaDBTConfigParser(self._default_parameters) - if self._default_parameters.get('profile_name') == 'databricks': + if self._default_parameters.get("profile_name") == "databricks": self._dbt_module = DatabricksDBTConfigParser(self._default_parameters) @staticmethod diff --git a/tests/fixtures/modules/dbt_config_parser_fixtures_athena.py b/tests/fixtures/modules/dbt_config_parser_fixtures_athena.py index 66005e6..5f44af4 100644 --- a/tests/fixtures/modules/dbt_config_parser_fixtures_athena.py +++ b/tests/fixtures/modules/dbt_config_parser_fixtures_athena.py @@ -256,7 +256,7 @@ "name": "stg_core_schema1__table1", "type": "dummy", "follow_external_dependency": True, - } + }, ] EXPECTED_MODEL_NODE = [ diff --git a/tests/fixtures/modules/dbt_config_parser_fixtures_databricks.py b/tests/fixtures/modules/dbt_config_parser_fixtures_databricks.py index 5c6e0a4..94a387f 100644 --- a/tests/fixtures/modules/dbt_config_parser_fixtures_databricks.py +++ b/tests/fixtures/modules/dbt_config_parser_fixtures_databricks.py @@ -7,10 +7,9 @@ "schema": "analytics_engineering", "host": "xxx.databricks.com", "http_path": "/sql/1.0/warehouses/xxx", - "token": "{{ env_var('SECRETDATABRICKS') }}" + "token": "{{ 
env_var('SECRETDATABRICKS') }}", }, } - } } @@ -268,7 +267,7 @@ "name": "stg_core_schema1__table1", "type": "dummy", "follow_external_dependency": True, - } + }, ] DATABRICKS_EXPECTED_MODEL_NODE = [ @@ -372,7 +371,7 @@ "schema": "analytics_engineering", "table": "model1", "type": "athena", - } + }, ] DATABRICKS_EXPECTED_DBT_STAGING_MODEL_DAGGER_OUTPUTS = [ @@ -381,5 +380,3 @@ "name": "stg_core_schema2__table2", }, ] - - diff --git a/tests/utilities/test_dbt_config_parser.py b/tests/utilities/test_dbt_config_parser.py index 8c188d3..9e4d18f 100644 --- a/tests/utilities/test_dbt_config_parser.py +++ b/tests/utilities/test_dbt_config_parser.py @@ -3,7 +3,10 @@ from unittest import skip from unittest.mock import patch, MagicMock -from dagger.utilities.dbt_config_parser import AthenaDBTConfigParser, DatabricksDBTConfigParser +from dagger.utilities.dbt_config_parser import ( + AthenaDBTConfigParser, + DatabricksDBTConfigParser, +) from dagger.utilities.module import Module from tests.fixtures.modules.dbt_config_parser_fixtures_athena import * from tests.fixtures.modules.dbt_config_parser_fixtures_databricks import * @@ -94,12 +97,20 @@ def test_generate_io_outputs(self): class TestDatabricksDBTConfigParser(unittest.TestCase): - @patch("builtins.open", new_callable=MagicMock, read_data=DATABRICKS_DBT_MANIFEST_FILE_FIXTURE) + @patch( + "builtins.open", + new_callable=MagicMock, + read_data=DATABRICKS_DBT_MANIFEST_FILE_FIXTURE, + ) @patch("json.loads", return_value=DATABRICKS_DBT_MANIFEST_FILE_FIXTURE) @patch("yaml.safe_load", return_value=DATABRICKS_DBT_PROFILE_FIXTURE) def setUp(self, mock_open, mock_json_load, mock_safe_load): - self._dbt_config_parser = DatabricksDBTConfigParser(DATABRICKS_DEFAULT_CONFIG_PARAMS) - self._sample_dbt_node = DATABRICKS_DBT_MANIFEST_FILE_FIXTURE["nodes"]["model.main.model1"] + self._dbt_config_parser = DatabricksDBTConfigParser( + DATABRICKS_DEFAULT_CONFIG_PARAMS + ) + self._sample_dbt_node = DATABRICKS_DBT_MANIFEST_FILE_FIXTURE["nodes"][ + 
"model.main.model1" + ] @skip("Run only locally") def test_generate_task_configs(self): @@ -140,7 +151,10 @@ def test_generate_io_inputs(self): "model3", DATABRICKS_EXPECTED_MODEL_MULTIPLE_DEPENDENCIES, ), - ("stg_core_schema2__table2", DATABRICKS_EXPECTED_DBT_STAGING_MODEL_DAGGER_INPUTS), + ( + "stg_core_schema2__table2", + DATABRICKS_EXPECTED_DBT_STAGING_MODEL_DAGGER_INPUTS, + ), ("int_model2", DATABRICKS_EXPECTED_DBT_INT_MODEL_DAGGER_INPUTS), ] for mock_input, expected_output in fixtures: @@ -151,7 +165,10 @@ def test_generate_io_inputs(self): def test_generate_io_outputs(self): fixtures = [ ("model1", DATABRICKS_EXPECTED_DAGGER_OUTPUTS), - ("stg_core_schema2__table2", DATABRICKS_EXPECTED_DBT_STAGING_MODEL_DAGGER_OUTPUTS), + ( + "stg_core_schema2__table2", + DATABRICKS_EXPECTED_DBT_STAGING_MODEL_DAGGER_OUTPUTS, + ), ] for mock_input, expected_output in fixtures: _, result = self._dbt_config_parser.generate_dagger_io(mock_input) From 4567b3e02266ab897c1a26c2edb1a85342065b5a Mon Sep 17 00:00:00 2001 From: David Siklosi Date: Fri, 26 Apr 2024 12:48:23 +0200 Subject: [PATCH 073/134] Switching to official batch operator --- .../operator_creators/batch_creator.py | 28 +- .../operator_creators/spark_creator.py | 2 +- .../airflow/operators/awsbatch_operator.py | 275 ++++++------------ 3 files changed, 105 insertions(+), 200 deletions(-) diff --git a/dagger/dag_creator/airflow/operator_creators/batch_creator.py b/dagger/dag_creator/airflow/operator_creators/batch_creator.py index a3d2534..0cfe9fb 100644 --- a/dagger/dag_creator/airflow/operator_creators/batch_creator.py +++ b/dagger/dag_creator/airflow/operator_creators/batch_creator.py @@ -1,5 +1,9 @@ +from pathlib import Path +from datetime import timedelta + from dagger.dag_creator.airflow.operator_creator import OperatorCreator from dagger.dag_creator.airflow.operators.awsbatch_operator import AWSBatchOperator +from dagger import conf class BatchCreator(OperatorCreator): @@ -8,6 +12,20 @@ class 
BatchCreator(OperatorCreator): def __init__(self, task, dag): super().__init__(task, dag) + @staticmethod + def _validate_job_name(job_name, absolute_job_name): + if not absolute_job_name and not job_name: + raise Exception("Both job_name and absolute_job_name cannot be null") + + if absolute_job_name is not None: + return absolute_job_name + + job_path = Path(conf.DAGS_DIR) / job_name.replace("-", "/") + assert ( + job_path.is_dir() + ), f"Job name `{job_name}`, points to a non-existing folder `{job_path}`" + return job_name + def _generate_command(self): command = [self._task.executable_prefix, self._task.executable] for param_name, param_value in self._template_parameters.items(): @@ -21,16 +39,16 @@ def _create_operator(self, **kwargs): overrides = self._task.overrides overrides.update({"command": self._generate_command()}) + job_name = self._validate_job_name(self._task.job_name, self._task.absolute_job_name) batch_op = AWSBatchOperator( dag=self._dag, task_id=self._task.name, - job_name=self._task.job_name, - absolute_job_name=self._task.absolute_job_name, + job_name=self._task.name, + job_definition=job_name, region_name=self._task.region_name, - cluster_name=self._task.cluster_name, job_queue=self._task.job_queue, - overrides=overrides, + container_overrides=overrides, + awslogs_enabled=True, **kwargs, ) - return batch_op diff --git a/dagger/dag_creator/airflow/operator_creators/spark_creator.py b/dagger/dag_creator/airflow/operator_creators/spark_creator.py index 2bb41e9..c48ebda 100644 --- a/dagger/dag_creator/airflow/operator_creators/spark_creator.py +++ b/dagger/dag_creator/airflow/operator_creators/spark_creator.py @@ -113,7 +113,7 @@ def _create_operator(self, **kwargs): job_name=job_name, region_name=self._task.region_name, job_queue=self._task.job_queue, - overrides=overrides, + container_overrides=overrides, **kwargs, ) elif self._task.spark_engine == "glue": diff --git a/dagger/dag_creator/airflow/operators/awsbatch_operator.py 
b/dagger/dag_creator/airflow/operators/awsbatch_operator.py index a267ba7..b2f4bb3 100644 --- a/dagger/dag_creator/airflow/operators/awsbatch_operator.py +++ b/dagger/dag_creator/airflow/operators/awsbatch_operator.py @@ -1,203 +1,90 @@ -from pathlib import Path -from time import sleep - -from airflow.providers.amazon.aws.hooks.base_aws import AwsBaseHook +from airflow.providers.amazon.aws.operators.batch import BatchOperator +from airflow.utils.context import Context from airflow.exceptions import AirflowException -from airflow.utils.decorators import apply_defaults - -from dagger.dag_creator.airflow.operators.dagger_base_operator import DaggerBaseOperator -from dagger.dag_creator.airflow.utils.decorators import lazy_property -from dagger import conf - - -class AWSBatchOperator(DaggerBaseOperator): - """ - Execute a job on AWS Batch Service - - .. warning: the queue parameter was renamed to job_queue to segregate the - internal CeleryExecutor queue from the AWS Batch internal queue. - - :param job_name: the name for the job that will run on AWS Batch - :type job_name: str - :param job_definition: the job definition name on AWS Batch - :type job_definition: str - :param job_queue: the queue name on AWS Batch - :type job_queue: str - :param overrides: the same parameter that boto3 will receive on - containerOverrides (templated): - http://boto3.readthedocs.io/en/latest/reference/services/batch.html#submit_job - :type overrides: dict - :param max_retries: exponential backoff retries while waiter is not - merged, 4200 = 48 hours - :type max_retries: int - :param aws_conn_id: connection id of AWS credentials / region name. If None, - credential boto3 strategy will be used - (http://boto3.readthedocs.io/en/latest/guide/configuration.html). - :type aws_conn_id: str - :param region_name: region name to use in AWS Hook. 
- Override the region_name in connection (if provided) - :type region_name: str - :param cluster_name: Batch cluster short name or arn - :type region_name: str - - """ - - ui_color = "#c3dae0" - client = None - arn = None - template_fields = ("overrides",) +from airflow.providers.amazon.aws.links.batch import ( + BatchJobDefinitionLink, + BatchJobQueueLink, +) +from airflow.providers.amazon.aws.links.logs import CloudWatchEventsLink - @apply_defaults - def __init__( - self, - job_queue, - job_name=None, - absolute_job_name=None, - overrides=None, - job_definition=None, - aws_conn_id=None, - region_name=None, - cluster_name=None, - *args, - **kwargs, - ): - super().__init__(*args, **kwargs) - self.job_name = self._validate_job_name(job_name, absolute_job_name) - self.aws_conn_id = aws_conn_id - self.region_name = region_name - self.cluster_name = cluster_name - self.job_definition = job_definition or self.job_name - self.job_queue = job_queue - self.overrides = overrides or {} - self.job_id = None - - @lazy_property - def batch_client(self): - return AwsBaseHook(aws_conn_id=self.aws_conn_id, client_type="batch").get_client_type( - region_name=self.region_name) - - @lazy_property - def logs_client(self): - return AwsBaseHook(aws_conn_id=self.aws_conn_id, client_type="logs").get_client_type( - region_name=self.region_name) - - @lazy_property - def ecs_client(self): - return AwsBaseHook(aws_conn_id=self.aws_conn_id, client_type="ecs").get_client_type( - region_name=self.region_name) +class AWSBatchOperator(AWSBatchOperator): @staticmethod - def _validate_job_name(job_name, absolute_job_name): - if absolute_job_name is None and job_name is None: - raise Exception("Both job_name and absolute_job_name cannot be null") - - if absolute_job_name is not None: - return absolute_job_name - - job_path = Path(conf.DAGS_DIR) / job_name.replace("-", "/") - assert ( - job_path.is_dir() - ), f"Job name `{job_name}`, points to a non-existing folder `{job_path}`" - return job_name - - 
def execute(self, context): - self.task_instance = context["ti"] - self.log.info( - "\n" - f"\n\tJob name: {self.job_name}" - f"\n\tJob queue: {self.job_queue}" - f"\n\tJob definition: {self.job_definition}" - "\n" - ) - - res = self.batch_client.submit_job( - jobName=self.job_name, - jobQueue=self.job_queue, - jobDefinition=self.job_definition, - containerOverrides=self.overrides, - ) - self.job_id = res["jobId"] - self.log.info( - "\n" - f"\n\tJob ID: {self.job_id}" - "\n" - ) - self.poll_task() - - def poll_task(self): - log_offset = 0 - print_logs_url = True - - while True: - res = self.batch_client.describe_jobs(jobs=[self.job_id]) - - if len(res["jobs"]) == 0: - sleep(3) - continue - - job = res["jobs"][0] - job_status = job["status"] - log_stream_name = job["container"].get("logStreamName") - - if print_logs_url and log_stream_name: - print_logs_url = False - self.log.info( - "\n" - f"\n\tLogs at: https://{self.region_name}.console.aws.amazon.com/cloudwatch/home?" - f"region={self.region_name}#logEventViewer:group=/aws/batch/job;stream={log_stream_name}" - "\n" - ) + def _format_cloudwatch_link(awslogs_region: str, awslogs_group: str, awslogs_stream_name: str): + return f"https://{awslogs_region}.console.aws.amazon.com/cloudwatch/home?region={awslogs_region}#logEventViewer:group={awslogs_group};stream={awslogs_stream_name}" + + def monitor_job(self, context: Context): + """Monitor an AWS Batch job. + + This can raise an exception or an AirflowTaskTimeout if the task was + created with ``execution_timeout``. 
+ """ + if not self.job_id: + raise AirflowException("AWS Batch job - job_id was not found") + + try: + job_desc = self.hook.get_job_description(self.job_id) + job_definition_arn = job_desc["jobDefinition"] + job_queue_arn = job_desc["jobQueue"] + self.log.info( + "AWS Batch job (%s) Job Definition ARN: %r, Job Queue ARN: %r", + self.job_id, + job_definition_arn, + job_queue_arn, + ) + except KeyError: + self.log.warning("AWS Batch job (%s) can't get Job Definition ARN and Job Queue ARN", self.job_id) + else: + BatchJobDefinitionLink.persist( + context=context, + operator=self, + region_name=self.hook.conn_region_name, + aws_partition=self.hook.conn_partition, + job_definition_arn=job_definition_arn, + ) + BatchJobQueueLink.persist( + context=context, + operator=self, + region_name=self.hook.conn_region_name, + aws_partition=self.hook.conn_partition, + job_queue_arn=job_queue_arn, + ) - if job_status in ("RUNNING", "FAILED", "SUCCEEDED") and log_stream_name: - try: - log_offset = self.print_logs(log_stream_name, log_offset) - except self.logs_client.exceptions.ResourceNotFoundException: - pass + if self.awslogs_enabled: + if self.waiters: + self.waiters.wait_for_job(self.job_id, get_batch_log_fetcher=self._get_batch_log_fetcher) else: - self.log.info(f"Job status: {job_status}") - - if job_status == "FAILED": - status_reason = res["jobs"][0]["statusReason"] - exit_code = res["jobs"][0]["container"].get("exitCode") - reason = res["jobs"][0]["container"].get("reason", "") - failure_msg = f"Status: {status_reason} | Exit code: {exit_code} | Reason: {reason}" - container_instance_arn = job["container"]["containerInstanceArn"] - self.retry_check(container_instance_arn) - raise AirflowException(failure_msg) - - if job_status == "SUCCEEDED": - self.log.info("AWS Batch Job has been successfully executed") - return - - sleep(7.5) + self.hook.wait_for_job(self.job_id, get_batch_log_fetcher=self._get_batch_log_fetcher) + else: + if self.waiters: + 
self.waiters.wait_for_job(self.job_id) + else: + self.hook.wait_for_job(self.job_id) + + awslogs = [] + try: + awslogs = self.hook.get_job_all_awslogs_info(self.job_id) + except AirflowException as ae: + self.log.warning("Cannot determine where to find the AWS logs for this Batch job: %s", ae) + + if awslogs: + self.log.info("AWS Batch job (%s) CloudWatch Events details found. Links to logs:", self.job_id) + link_builder = CloudWatchEventsLink() + for log in awslogs: + self.log.info(self._format_cloudwatch_link(**log)) + if len(awslogs) > 1: + # there can be several log streams on multi-node jobs + self.log.warning( + "out of all those logs, we can only link to one in the UI. Using the first one." + ) - def retry_check(self, container_instance_arn): - res = self.ecs_client.describe_container_instances( - cluster=self.cluster_name, containerInstances=[container_instance_arn] - ) - instance_status = res["containerInstances"][0]["status"] - if instance_status != "ACTIVE": - self.log.warning( - f"Instance in {instance_status} state: setting the task up for retry..." 
+ CloudWatchEventsLink.persist( + context=context, + operator=self, + region_name=self.hook.conn_region_name, + aws_partition=self.hook.conn_partition, + **awslogs[0], ) - self.retries += self.task_instance.try_number + 1 - self.task_instance.max_tries = self.retries - - def print_logs(self, log_stream_name, log_offset): - logs = self.logs_client.get_log_events( - logGroupName="/aws/batch/job", - logStreamName=log_stream_name, - startFromHead=True, - ) - - for event in logs["events"][log_offset:]: - self.log.info(event["message"]) - - log_offset = len(logs["events"]) - return log_offset - def on_kill(self): - res = self.batch_client.terminate_job( - jobId=self.job_id, reason="Task killed by the user" - ) - self.log.info(res) + self.hook.check_job_success(self.job_id) + self.log.info("AWS Batch job (%s) succeeded", self.job_id) From 1c2ad82672d0837dfbaf3a63b95c13e866fde8d3 Mon Sep 17 00:00:00 2001 From: claudiazi Date: Fri, 26 Apr 2024 12:54:26 +0200 Subject: [PATCH 074/134] feat: add another param in dbt task --- dagger/dag_creator/airflow/operator_creators/dbt_creator.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dagger/dag_creator/airflow/operator_creators/dbt_creator.py b/dagger/dag_creator/airflow/operator_creators/dbt_creator.py index 4b88fe3..60866c8 100644 --- a/dagger/dag_creator/airflow/operator_creators/dbt_creator.py +++ b/dagger/dag_creator/airflow/operator_creators/dbt_creator.py @@ -17,7 +17,7 @@ def __init__(self, task, dag): self._select = task.select self._dbt_command = task.dbt_command self._vars = task.vars - # self._create_external_athena_table = task.create_external_athena_table + self._create_external_athena_table = task.create_external_athena_table def _generate_command(self): command = [self._task.executable_prefix, self._task.executable] @@ -31,7 +31,7 @@ def _generate_command(self): if self._vars: dbt_vars = json.dumps(self._vars) command.append(f"--vars='{dbt_vars}'") - # if self._create_external_athena_table: 
- # command.append(f"--create_external_athena_table={self._create_external_athena_table}") + if self._create_external_athena_table: + command.append(f"--create_external_athena_table={self._create_external_athena_table}") return command From a8e647184a76c5dc3be96b4f6938c9a038356282 Mon Sep 17 00:00:00 2001 From: David Siklosi Date: Fri, 26 Apr 2024 12:55:35 +0200 Subject: [PATCH 075/134] Complete renaming of classes --- dagger/dag_creator/airflow/operators/awsbatch_operator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dagger/dag_creator/airflow/operators/awsbatch_operator.py b/dagger/dag_creator/airflow/operators/awsbatch_operator.py index b2f4bb3..23b3596 100644 --- a/dagger/dag_creator/airflow/operators/awsbatch_operator.py +++ b/dagger/dag_creator/airflow/operators/awsbatch_operator.py @@ -8,7 +8,7 @@ from airflow.providers.amazon.aws.links.logs import CloudWatchEventsLink -class AWSBatchOperator(AWSBatchOperator): +class AWSBatchOperator(BatchOperator): @staticmethod def _format_cloudwatch_link(awslogs_region: str, awslogs_group: str, awslogs_stream_name: str): return f"https://{awslogs_region}.console.aws.amazon.com/cloudwatch/home?region={awslogs_region}#logEventViewer:group={awslogs_group};stream={awslogs_stream_name}" From 7e29420c3739f16185dbde1674016baed5e63fcf Mon Sep 17 00:00:00 2001 From: claudiazi Date: Fri, 26 Apr 2024 13:35:59 +0200 Subject: [PATCH 076/134] feat: refactor --- dagger/utilities/dbt_config_parser.py | 37 ++++++++------------------- 1 file changed, 11 insertions(+), 26 deletions(-) diff --git a/dagger/utilities/dbt_config_parser.py b/dagger/utilities/dbt_config_parser.py index 11c4325..8f761e3 100644 --- a/dagger/utilities/dbt_config_parser.py +++ b/dagger/utilities/dbt_config_parser.py @@ -59,12 +59,19 @@ def _load_file(file_path: str, file_type: str) -> dict: _logger.error(f"File not found: {file_path}") exit(1) - @abstractmethod def _get_athena_table_task( self, node: dict, follow_external_dependency: bool = 
False ) -> dict: - """Generate an athena table task for a DBT node. Must be implemented by subclasses. This function should be deprecated after the source connects with databricks directly""" - pass + """Generate an athena table task for a DBT node.""" + task = ATHENA_TASK_BASE.copy() + if follow_external_dependency: + task["follow_external_dependency"] = True + + task["schema"] = node.get("schema", self._default_schema) + task["table"] = node.get("name", "") + task["name"] = f"{task['schema']}__{task['table']}_athena" + + return task @abstractmethod def _get_table_task( @@ -173,7 +180,6 @@ def generate_dagger_io(self, model_name: str) -> Tuple[List[dict], List[dict]]: inputs_list = [] model_node = self._nodes_in_manifest[f"model.main.{model_name}"] parent_node_names = model_node.get("depends_on", {}).get("nodes", []) - print(f"parent node name: {parent_node_names}") for parent_node_name in parent_node_names: dagger_input = self._generate_dagger_tasks(parent_node_name) @@ -187,8 +193,6 @@ def generate_dagger_io(self, model_name: str) -> Tuple[List[dict], List[dict]]: ).values() ) - print(unique_inputs) - return unique_inputs, output_list @@ -209,20 +213,7 @@ def _get_table_task( """ Generates the dagger athena task for the DBT model node """ - task = ATHENA_TASK_BASE.copy() - if follow_external_dependency: - task["follow_external_dependency"] = True - - task["schema"] = node.get("schema", self._default_schema) - task["table"] = node.get("name", "") - task["name"] = f"{task['schema']}__{task['table']}_athena" - - return task - - def _get_athena_table_task( - self, node: dict, follow_external_dependency: bool = False - ) -> dict: - return self._get_table_task(node, follow_external_dependency) + return self._get_athena_table_task(node, follow_external_dependency) def _get_model_data_location( self, node: dict, schema: str, model_name: str @@ -286,7 +277,6 @@ def __init__(self, default_config_parameters: dict): super().__init__(default_config_parameters) 
self._profile_name = "databricks" self._default_catalog = self._target_config.get("catalog") - self._athena_dbt_parser = AthenaDBTConfigParser(default_config_parameters) self._create_external_athena_table = default_config_parameters.get( "create_external_athena_table", False ) @@ -310,11 +300,6 @@ def _get_table_task( return task - def _get_athena_table_task( - self, node: dict, follow_external_dependency: bool = False - ) -> dict: - return self._athena_dbt_parser._get_table_task(node, follow_external_dependency) - def _get_model_data_location( self, node: dict, schema: str, model_name: str ) -> Tuple[str, str]: From 5659c689bf76ce1082079da075532e5dd7ecbf51 Mon Sep 17 00:00:00 2001 From: claudiazi Date: Fri, 26 Apr 2024 17:56:33 +0200 Subject: [PATCH 077/134] feat: adjust the s3 tasks --- dagger/utilities/dbt_config_parser.py | 61 ++++++------------- .../dbt_config_parser_fixtures_athena.py | 8 +-- .../dbt_config_parser_fixtures_databricks.py | 8 +-- 3 files changed, 27 insertions(+), 50 deletions(-) diff --git a/dagger/utilities/dbt_config_parser.py b/dagger/utilities/dbt_config_parser.py index 8f761e3..0973444 100644 --- a/dagger/utilities/dbt_config_parser.py +++ b/dagger/utilities/dbt_config_parser.py @@ -87,10 +87,20 @@ def _get_model_data_location( """Get the S3 path of the DBT model relative to the data bucket. Must be implemented by subclasses.""" pass - @abstractmethod - def _get_s3_task(self, node: dict) -> dict: - """Generate an S3 task configuration based on a DBT node. 
Must be implemented by subclasses.""" - pass + def _get_s3_task(self, node: dict, is_output: bool = False) -> dict: + """ + Generates the dagger s3 task for the databricks-dbt model node + """ + task = S3_TASK_BASE.copy() + + schema = node.get("schema", self._default_schema) + table = node.get("name", "") + task["name"] = f"output_s3_path" if is_output else f"s3_{table}" + task["bucket"], task["path"] = self._get_model_data_location( + node, schema, table + ) + + return task @staticmethod def _get_dummy_task(node: dict, follow_external_dependency: bool = False) -> dict: @@ -230,25 +240,6 @@ def _get_model_data_location( return bucket_name, data_path - def _get_s3_task(self, node: dict) -> dict: - """ - Generates the dagger s3 task for the DBT model node - Args: - node: The extracted node from the manifest.json file - - Returns: - dict: The dagger s3 task for the DBT model node - - """ - task = S3_TASK_BASE.copy() - - schema = node.get("schema", self._default_schema) - table = node.get("name", "") - task["name"] = f"{schema}__{table}_s3" - task["bucket"], task["path"] = self._get_model_data_location( - node, schema, table - ) - return task def _generate_dagger_output(self, node: dict): """ @@ -267,7 +258,7 @@ def _generate_dagger_output(self, node: dict): ) or node.get("name").startswith("stg_"): return [self._get_dummy_task(node)] else: - return [self._get_table_task(node), self._get_s3_task(node)] + return [self._get_table_task(node), self._get_s3_task(node, is_output=True)] class DatabricksDBTConfigParser(DBTConfigParser): @@ -313,22 +304,6 @@ def _get_model_data_location( return bucket_name, data_path - def _get_s3_task(self, node: dict) -> dict: - """ - Generates the dagger s3 task for the databricks-dbt model node - """ - task = S3_TASK_BASE.copy() - - catalog = node.get("database", self._default_catalog) - schema = node.get("schema", self._default_schema) - table = node.get("name", "") - task["name"] = f"{catalog}__{schema}__{table}_s3" - task["bucket"], 
task["path"] = self._get_model_data_location( - node, schema, table - ) - - return task - def _generate_dagger_output(self, node: dict): """ Generates the dagger output for the DBT model node with the databricks-dbt adapter. @@ -345,10 +320,12 @@ def _generate_dagger_output(self, node: dict): if node.get("config", {}).get("materialized") in ( "view", "ephemeral", - ) or node.get("name").startswith("stg_"): + ) or node.get("name").startswith("stg_") or "preparation" in "preparation" in node.get( + "schema", "" + ): return [self._get_dummy_task(node)] else: - output_tasks = [self._get_table_task(node), self._get_s3_task(node)] + output_tasks = [self._get_table_task(node), self._get_s3_task(node, is_output=True)] if self._create_external_athena_table: output_tasks.append(self._get_athena_table_task(node)) return output_tasks diff --git a/tests/fixtures/modules/dbt_config_parser_fixtures_athena.py b/tests/fixtures/modules/dbt_config_parser_fixtures_athena.py index 5f44af4..072fb41 100644 --- a/tests/fixtures/modules/dbt_config_parser_fixtures_athena.py +++ b/tests/fixtures/modules/dbt_config_parser_fixtures_athena.py @@ -226,7 +226,7 @@ }, { "bucket": "bucket1-data-lake", - "name": "analytics_engineering__model2_s3", + "name": "s3_model2", "path": "path2/model2", "type": "s3", }, @@ -269,7 +269,7 @@ }, { "bucket": "bucket1-data-lake", - "name": "analytics_engineering__model1_s3", + "name": "s3_model1", "path": "path1/model1", "type": "s3", }, @@ -290,7 +290,7 @@ }, { "bucket": "bucket1-data-lake", - "name": "analytics_engineering__model2_s3", + "name": "s3_model2", "path": "path2/model2", "type": "s3", }, @@ -339,7 +339,7 @@ }, { "bucket": "bucket1-data-lake", - "name": "analytics_engineering__model1_s3", + "name": "output_s3_path", "path": "path1/model1", "type": "s3", }, diff --git a/tests/fixtures/modules/dbt_config_parser_fixtures_databricks.py b/tests/fixtures/modules/dbt_config_parser_fixtures_databricks.py index 94a387f..b415c60 100644 --- 
a/tests/fixtures/modules/dbt_config_parser_fixtures_databricks.py +++ b/tests/fixtures/modules/dbt_config_parser_fixtures_databricks.py @@ -237,7 +237,7 @@ }, { "bucket": "chodata-data-lake", - "name": "marts__analytics_engineering__model2_s3", + "name": "s3_model2", "path": "analytics_warehouse/data/marts/analytics_engineering/model2", "type": "s3", }, @@ -281,7 +281,7 @@ }, { "bucket": "chodata-data-lake", - "name": "marts__analytics_engineering__model1_s3", + "name": "s3_model1", "path": "analytics_warehouse/data/marts/analytics_engineering/model1", "type": "s3", }, @@ -303,7 +303,7 @@ }, { "bucket": "chodata-data-lake", - "name": "marts__analytics_engineering__model2_s3", + "name": "s3_model2", "path": "analytics_warehouse/data/marts/analytics_engineering/model2", "type": "s3", }, @@ -362,7 +362,7 @@ }, { "bucket": "chodata-data-lake", - "name": "marts__analytics_engineering__model1_s3", + "name": "output_s3_path", "path": "analytics_warehouse/data/marts/analytics_engineering/model1", "type": "s3", }, From 81697d48d6030695f1e8a29f410ff778928965b4 Mon Sep 17 00:00:00 2001 From: claudiazi Date: Mon, 29 Apr 2024 11:55:53 +0200 Subject: [PATCH 078/134] feat: adjust the _get_s3_task for different dbt adapters --- dagger/utilities/dbt_config_parser.py | 44 ++++++++++++++++++++------- 1 file changed, 33 insertions(+), 11 deletions(-) diff --git a/dagger/utilities/dbt_config_parser.py b/dagger/utilities/dbt_config_parser.py index 0973444..00f58c1 100644 --- a/dagger/utilities/dbt_config_parser.py +++ b/dagger/utilities/dbt_config_parser.py @@ -87,20 +87,12 @@ def _get_model_data_location( """Get the S3 path of the DBT model relative to the data bucket. Must be implemented by subclasses.""" pass + @abstractmethod def _get_s3_task(self, node: dict, is_output: bool = False) -> dict: """ - Generates the dagger s3 task for the databricks-dbt model node + Generate an S3 task for a DBT node for the specific dbt-adapter. Must be implemented by subclasses. 
""" - task = S3_TASK_BASE.copy() - - schema = node.get("schema", self._default_schema) - table = node.get("name", "") - task["name"] = f"output_s3_path" if is_output else f"s3_{table}" - task["bucket"], task["path"] = self._get_model_data_location( - node, schema, table - ) - - return task + pass @staticmethod def _get_dummy_task(node: dict, follow_external_dependency: bool = False) -> dict: @@ -240,6 +232,21 @@ def _get_model_data_location( return bucket_name, data_path + def _get_s3_task(self, node: dict, is_output: bool = False) -> dict: + """ + Generates the dagger s3 task for the athena-dbt model node + """ + task = S3_TASK_BASE.copy() + + schema = node.get("schema", self._default_schema) + table = node.get("name", "") + task["name"] = f"output_s3_path" if is_output else f"s3_{table}" + task["bucket"] = self._default_data_bucket + _, task["path"] = self._get_model_data_location( + node, schema, table + ) + + return task def _generate_dagger_output(self, node: dict): """ @@ -304,6 +311,21 @@ def _get_model_data_location( return bucket_name, data_path + def _get_s3_task(self, node: dict, is_output: bool = False) -> dict: + """ + Generates the dagger s3 task for the databricks-dbt model node + """ + task = S3_TASK_BASE.copy() + + schema = node.get("schema", self._default_schema) + table = node.get("name", "") + task["name"] = f"output_s3_path" if is_output else f"s3_{table}" + task["bucket"], task["path"] = self._get_model_data_location( + node, schema, table + ) + + return task + def _generate_dagger_output(self, node: dict): """ Generates the dagger output for the DBT model node with the databricks-dbt adapter. 
From 7c1aa22917786068a85eec953e97078ca656501a Mon Sep 17 00:00:00 2001 From: claudiazi Date: Mon, 29 Apr 2024 14:09:12 +0200 Subject: [PATCH 079/134] fix: define the correct target_config for databricks adapter --- dagger/utilities/dbt_config_parser.py | 30 ++++++++++++++++----------- 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/dagger/utilities/dbt_config_parser.py b/dagger/utilities/dbt_config_parser.py index 00f58c1..757ad1a 100644 --- a/dagger/utilities/dbt_config_parser.py +++ b/dagger/utilities/dbt_config_parser.py @@ -26,9 +26,11 @@ def __init__(self, config_parameters: dict): self._get_manifest_path(), file_type="json" ) profile_data = self._load_file(self._get_profile_path(), file_type="yaml") - self._target_config = profile_data[self._profile_name]["outputs"][ - self._target_name - ] + self._target_config = ( + profile_data[self._profile_name]["outputs"].get(self._target_name) + if self._profile_name == "athena" + else profile_data[self._profile_name]["outputs"]["data"] + ) # if databricks, get the default catalog and schema from the data output self._default_schema = self._target_config.get("schema", "") self._nodes_in_manifest = self._manifest_data.get("nodes", {}) self._sources_in_manifest = self._manifest_data.get("sources", {}) @@ -242,9 +244,7 @@ def _get_s3_task(self, node: dict, is_output: bool = False) -> dict: table = node.get("name", "") task["name"] = f"output_s3_path" if is_output else f"s3_{table}" task["bucket"] = self._default_data_bucket - _, task["path"] = self._get_model_data_location( - node, schema, table - ) + _, task["path"] = self._get_model_data_location(node, schema, table) return task @@ -339,15 +339,21 @@ def _generate_dagger_output(self, node: dict): dict: The dagger output, which is a combination of an athena and s3 task for the DBT model node """ - if node.get("config", {}).get("materialized") in ( - "view", - "ephemeral", - ) or node.get("name").startswith("stg_") or "preparation" in "preparation" in 
node.get( - "schema", "" + if ( + node.get("config", {}).get("materialized") + in ( + "view", + "ephemeral", + ) + or node.get("name").startswith("stg_") + or "preparation" in "preparation" in node.get("schema", "") ): return [self._get_dummy_task(node)] else: - output_tasks = [self._get_table_task(node), self._get_s3_task(node, is_output=True)] + output_tasks = [ + self._get_table_task(node), + self._get_s3_task(node, is_output=True), + ] if self._create_external_athena_table: output_tasks.append(self._get_athena_table_task(node)) return output_tasks From 40b28ef9298d5151fd73e894606a71bc18409493 Mon Sep 17 00:00:00 2001 From: claudiazi Date: Mon, 29 Apr 2024 14:18:12 +0200 Subject: [PATCH 080/134] fix: _generate_dagger_output --- dagger/utilities/dbt_config_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dagger/utilities/dbt_config_parser.py b/dagger/utilities/dbt_config_parser.py index 757ad1a..b801379 100644 --- a/dagger/utilities/dbt_config_parser.py +++ b/dagger/utilities/dbt_config_parser.py @@ -346,7 +346,7 @@ def _generate_dagger_output(self, node: dict): "ephemeral", ) or node.get("name").startswith("stg_") - or "preparation" in "preparation" in node.get("schema", "") + or "preparation" in node.get("schema", "") ): return [self._get_dummy_task(node)] else: From 9c0311f2d1daee5d210c26ea5dadd226921feb62 Mon Sep 17 00:00:00 2001 From: claudiazi Date: Mon, 29 Apr 2024 23:12:31 +0200 Subject: [PATCH 081/134] extend: command --- dagger/dag_creator/airflow/operator_creators/dbt_creator.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dagger/dag_creator/airflow/operator_creators/dbt_creator.py b/dagger/dag_creator/airflow/operator_creators/dbt_creator.py index 60866c8..c4e250a 100644 --- a/dagger/dag_creator/airflow/operator_creators/dbt_creator.py +++ b/dagger/dag_creator/airflow/operator_creators/dbt_creator.py @@ -33,5 +33,6 @@ def _generate_command(self): command.append(f"--vars='{dbt_vars}'") if 
self._create_external_athena_table: command.append(f"--create_external_athena_table={self._create_external_athena_table}") + command.append(super()._generate_command()) return command From 14f9d1f0dcf81328cb0aee47b1644f5f4e6643b8 Mon Sep 17 00:00:00 2001 From: claudiazi Date: Tue, 30 Apr 2024 09:47:12 +0200 Subject: [PATCH 082/134] fix: _generate_command --- dagger/dag_creator/airflow/operator_creators/dbt_creator.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dagger/dag_creator/airflow/operator_creators/dbt_creator.py b/dagger/dag_creator/airflow/operator_creators/dbt_creator.py index c4e250a..9be9ee8 100644 --- a/dagger/dag_creator/airflow/operator_creators/dbt_creator.py +++ b/dagger/dag_creator/airflow/operator_creators/dbt_creator.py @@ -33,6 +33,8 @@ def _generate_command(self): command.append(f"--vars='{dbt_vars}'") if self._create_external_athena_table: command.append(f"--create_external_athena_table={self._create_external_athena_table}") - command.append(super()._generate_command()) - + for param_name, param_value in self._template_parameters.items(): + command.append( + f"--{param_name}={param_value}" + ) return command From f7cc9830f97d210a21b74f87c87957b8f16799a1 Mon Sep 17 00:00:00 2001 From: claudiazi Date: Tue, 30 Apr 2024 17:12:09 +0200 Subject: [PATCH 083/134] fix: _get_model_data_location in DatabricksDBTConfigParser --- dagger/utilities/dbt_config_parser.py | 2 +- .../modules/dbt_config_parser_fixtures_databricks.py | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/dagger/utilities/dbt_config_parser.py b/dagger/utilities/dbt_config_parser.py index b801379..1b64132 100644 --- a/dagger/utilities/dbt_config_parser.py +++ b/dagger/utilities/dbt_config_parser.py @@ -305,7 +305,7 @@ def _get_model_data_location( Gets the S3 path of the dbt model relative to the data bucket. 
""" location_root = node.get("config", {}).get("location_root") - location = join(location_root, schema, model_name) + location = join(location_root, model_name) split = location.split("//")[1].split("/") bucket_name, data_path = split[0], "/".join(split[1:]) diff --git a/tests/fixtures/modules/dbt_config_parser_fixtures_databricks.py b/tests/fixtures/modules/dbt_config_parser_fixtures_databricks.py index b415c60..232fe8c 100644 --- a/tests/fixtures/modules/dbt_config_parser_fixtures_databricks.py +++ b/tests/fixtures/modules/dbt_config_parser_fixtures_databricks.py @@ -22,7 +22,7 @@ "unique_id": "model.main.model1", "resource_type": "model", "config": { - "location_root": "s3://chodata-data-lake/analytics_warehouse/data/marts", + "location_root": "s3://chodata-data-lake/analytics_warehouse/data/marts/analytics_engineering", "materialized": "incremental", "incremental_strategy": "insert_overwrite", }, @@ -30,7 +30,7 @@ "tags": ["daily"], "unrendered_config": { "materialized": "incremental", - "location_root": "s3://chodata-data-lake/analytics_warehouse/data/marts", + "location_root": "s3://chodata-data-lake/analytics_warehouse/data/marts/analytics_engineering", "incremental_strategy": "insert_overwrite", "partitioned_by": ["year", "month", "day", "dt"], "tags": ["daily"], @@ -90,7 +90,7 @@ "unique_id": "model.main.model2", "resource_type": "model", "config": { - "location_root": "s3://chodata-data-lake/analytics_warehouse/data/marts", + "location_root": "s3://chodata-data-lake/analytics_warehouse/data/marts/analytics_engineering", "materialized": "table", }, "depends_on": {"macros": [], "nodes": []}, @@ -103,7 +103,7 @@ "resource_type": "model", "config": { "materialized": "ephemeral", - "location_root": "s3://chodata-data-lake/analytics_warehouse/data/intermediate", + "location_root": "s3://chodata-data-lake/analytics_warehouse/data/intermediate/analytics_engineering", }, "depends_on": { "macros": [], @@ -128,7 +128,7 @@ "schema": "analytics_engineering", 
"unique_id": "model.main.model3", "config": { - "location_root": "s3://chodata-data-lake/analytics_warehouse/data/marts", + "location_root": "s3://chodata-data-lake/analytics_warehouse/data/marts/analytics_engineering", }, "depends_on": { "macros": [], @@ -147,7 +147,7 @@ "schema": "analytics_engineering", "config": { "materialized": "ephemeral", - "location_root": "s3://chodata-data-lake/analytics_warehouse/data/intermediate", + "location_root": "s3://chodata-data-lake/analytics_warehouse/data/intermediate/analytics_engineering", }, "depends_on": { "macros": [], From 59cf8473a738c7ea57e72702f80a1997d83b909d Mon Sep 17 00:00:00 2001 From: claudiazi Date: Fri, 3 May 2024 15:42:18 +0200 Subject: [PATCH 084/134] fix: generate_task for dbt tasks --- dagger/utilities/module.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/dagger/utilities/module.py b/dagger/utilities/module.py index 6b6aa86..968e196 100644 --- a/dagger/utilities/module.py +++ b/dagger/utilities/module.py @@ -28,12 +28,6 @@ def __init__(self, path_to_config, target_dir): self._override_parameters = config.get("override_parameters", {}) self._default_parameters = config.get("default_parameters", {}) - if "dbt" in self._tasks.keys(): - if self._default_parameters.get("profile_name") == "athena": - self._dbt_module = AthenaDBTConfigParser(self._default_parameters) - if self._default_parameters.get("profile_name") == "databricks": - self._dbt_module = DatabricksDBTConfigParser(self._default_parameters) - @staticmethod def read_yaml(yaml_str): try: @@ -85,6 +79,11 @@ def generate_task_configs(self): template_parameters = {} template_parameters.update(self._default_parameters or {}) template_parameters.update(attrs) + if "dbt" in self._tasks.keys(): + if template_parameters.get("profile_name") == "athena": + self._dbt_module = AthenaDBTConfigParser(template_parameters) + if template_parameters.get("profile_name") == "databricks": + self._dbt_module = 
DatabricksDBTConfigParser(template_parameters) for task, task_yaml in self._tasks.items(): task_name = f"{branch_name}_{task}" From c44a7b5ccb83e7e53810766d1a0b57c5f50d78c4 Mon Sep 17 00:00:00 2001 From: David Siklosi Date: Wed, 12 Jun 2024 11:37:16 +0200 Subject: [PATCH 085/134] Replacing string replacement with jinja in module processor --- dagger/utilities/dbt_config_parser.py | 12 ++++++++++++ dagger/utilities/module.py | 28 ++++++++++++++++----------- 2 files changed, 29 insertions(+), 11 deletions(-) diff --git a/dagger/utilities/dbt_config_parser.py b/dagger/utilities/dbt_config_parser.py index 1b64132..3be57fe 100644 --- a/dagger/utilities/dbt_config_parser.py +++ b/dagger/utilities/dbt_config_parser.py @@ -35,6 +35,18 @@ def __init__(self, config_parameters: dict): self._nodes_in_manifest = self._manifest_data.get("nodes", {}) self._sources_in_manifest = self._manifest_data.get("sources", {}) + @property + def nodes_in_manifest(self): + return self._nodes_in_manifest + + @property + def sources_in_manifest(self): + return self._sources_in_manifest + + @property + def dbt_default_schema(self): + return self._default_schema + def _get_manifest_path(self) -> str: """ Construct path for manifest.json file based on configuration parameters. 
diff --git a/dagger/utilities/module.py b/dagger/utilities/module.py index 968e196..ff1329f 100644 --- a/dagger/utilities/module.py +++ b/dagger/utilities/module.py @@ -6,6 +6,8 @@ DatabricksDBTConfigParser, ) +import jinja2 + import yaml _logger = logging.getLogger("root") @@ -48,19 +50,13 @@ def read_task_config(self, task): @staticmethod def replace_template_parameters(_task_str, _template_parameters): - for _key, _value in _template_parameters.items(): - if type(_value) == str: - try: - int_value = int(_value) - _value = f'"{_value}"' - except: - pass - locals()[_key] = _value + environment = jinja2.Environment() + template = environment.from_string(_task_str) + rendered_task = template.render(_template_parameters) return ( - _task_str.format(**locals()) - .replace("{", "{{") - .replace("}", "}}") + rendered_task + # TODO Remove this hack and use Jinja escaping instead of special expression in template files .replace("__CBS__", "{") .replace("__CBE__", "}") ) @@ -79,12 +75,22 @@ def generate_task_configs(self): template_parameters = {} template_parameters.update(self._default_parameters or {}) template_parameters.update(attrs) + template_parameters['branch_name'] = branch_name + + dbt_manifest = None if "dbt" in self._tasks.keys(): if template_parameters.get("profile_name") == "athena": self._dbt_module = AthenaDBTConfigParser(template_parameters) if template_parameters.get("profile_name") == "databricks": self._dbt_module = DatabricksDBTConfigParser(template_parameters) + dbt_manifest = {} + dbt_manifest['nodes'] = self._dbt_module.nodes_in_manifest + dbt_manifest['sources'] = self._dbt_module.sources_in_manifest + + template_parameters["dbt_manifest"] = dbt_manifest + template_parameters["dbt_default_schema"] = self._dbt_module.dbt_default_schema + for task, task_yaml in self._tasks.items(): task_name = f"{branch_name}_{task}" _logger.info(f"Generating task {task_name}") From 1bc6bf28e2286f78e3b30902a7e60a63f24a182f Mon Sep 17 00:00:00 2001 From: claudiazi 
Date: Mon, 17 Jun 2024 12:30:37 +0200 Subject: [PATCH 086/134] feat: adjust the dbt config parser so that view/ephemeral staging layer doesnt need a task --- dagger/utilities/dbt_config_parser.py | 8 +++-- .../dbt_config_parser_fixtures_athena.py | 36 ++++++++++++++++--- .../dbt_config_parser_fixtures_databricks.py | 32 +++++++++++++++-- tests/utilities/test_dbt_config_parser.py | 5 --- 4 files changed, 67 insertions(+), 14 deletions(-) diff --git a/dagger/utilities/dbt_config_parser.py b/dagger/utilities/dbt_config_parser.py index 3be57fe..bb79b39 100644 --- a/dagger/utilities/dbt_config_parser.py +++ b/dagger/utilities/dbt_config_parser.py @@ -161,16 +161,18 @@ def _generate_dagger_tasks(self, node_name: str) -> List[Dict]: node, follow_external_dependency=True ) dagger_tasks.append(table_task) - elif node.get("config", {}).get("materialized") == "ephemeral": + elif node.get("config", {}).get("materialized") == "ephemeral" or ((node.get("name").startswith("stg_") or "preparation" in node.get( + "schema", "" + )) and node.get("config", {}).get("materialized") != "table"): task = self._get_dummy_task(node, follow_external_dependency=True) dagger_tasks.append(task) ephemeral_parent_node_names = node.get("depends_on", {}).get("nodes", []) for node_name in ephemeral_parent_node_names: dagger_tasks += self._generate_dagger_tasks(node_name) - elif node.get("name").startswith("stg_") or "preparation" in node.get( + elif (node.get("name").startswith("stg_") or "preparation" in node.get( "schema", "" - ): + ) and node.get("config", {}).get("materialized") == "table"): dagger_tasks.append( self._get_dummy_task(node, follow_external_dependency=True) ) diff --git a/tests/fixtures/modules/dbt_config_parser_fixtures_athena.py b/tests/fixtures/modules/dbt_config_parser_fixtures_athena.py index 072fb41..2f1c6ee 100644 --- a/tests/fixtures/modules/dbt_config_parser_fixtures_athena.py +++ b/tests/fixtures/modules/dbt_config_parser_fixtures_athena.py @@ -58,7 +58,7 @@ "unique_id": 
"model.main.stg_core_schema1__table1", "name": "stg_core_schema1__table1", "config": { - "materialized": "view", + "materialized": "table", }, "depends_on": { "macros": [], @@ -235,6 +235,20 @@ "type": "dummy", "follow_external_dependency": True, }, + { + "type": "athena", + "name": "core_schema2__table2_athena", + "schema": "core_schema2", + "table": "table2", + "follow_external_dependency": True, + }, + { + "type": "athena", + "name": "core_schema2__table3_athena", + "schema": "core_schema2", + "table": "table3", + "follow_external_dependency": True, + }, ] EXPECTED_EPHEMERAL_NODE = [ @@ -256,7 +270,7 @@ "name": "stg_core_schema1__table1", "type": "dummy", "follow_external_dependency": True, - }, + } ] EXPECTED_MODEL_NODE = [ @@ -281,6 +295,21 @@ "type": "dummy", "follow_external_dependency": True, }, + { + "type": "athena", + "name": "core_schema2__table2_athena", + "schema": "core_schema2", + "table": "table2", + "follow_external_dependency": True, + }, + { + "type": "athena", + "name": "core_schema2__table3_athena", + "schema": "core_schema2", + "table": "table3", + "follow_external_dependency": True, + }, + {"name": "seed_buyer_country_overwrite", "type": "dummy"}, { "name": "analytics_engineering__model2_athena", "schema": "analytics_engineering", @@ -304,12 +333,11 @@ "name": "int_model2", "follow_external_dependency": True, }, - {"name": "seed_buyer_country_overwrite", "type": "dummy"}, { "name": "stg_core_schema1__table1", "type": "dummy", "follow_external_dependency": True, - }, + } ] EXPECTED_DBT_STAGING_MODEL_DAGGER_INPUTS = [ diff --git a/tests/fixtures/modules/dbt_config_parser_fixtures_databricks.py b/tests/fixtures/modules/dbt_config_parser_fixtures_databricks.py index 232fe8c..342c32a 100644 --- a/tests/fixtures/modules/dbt_config_parser_fixtures_databricks.py +++ b/tests/fixtures/modules/dbt_config_parser_fixtures_databricks.py @@ -57,7 +57,7 @@ "resource_type": "model", "config": { "location_root": 
"s3://chodata-data-lake/analytics_warehouse/data/preparation", - "materialized": "view", + "materialized": "table", }, "depends_on": { "macros": [], @@ -246,6 +246,20 @@ "type": "dummy", "follow_external_dependency": True, }, + { + "type": "athena", + "name": "core_schema2__table2_athena", + "schema": "core_schema2", + "table": "table2", + "follow_external_dependency": True, + }, + { + "type": "athena", + "name": "core_schema2__table3_athena", + "schema": "core_schema2", + "table": "table3", + "follow_external_dependency": True, + }, ] DATABRICKS_EXPECTED_EPHEMERAL_NODE = [ @@ -293,6 +307,21 @@ "type": "dummy", "follow_external_dependency": True, }, + { + "type": "athena", + "name": "core_schema2__table2_athena", + "schema": "core_schema2", + "table": "table2", + "follow_external_dependency": True, + }, + { + "type": "athena", + "name": "core_schema2__table3_athena", + "schema": "core_schema2", + "table": "table3", + "follow_external_dependency": True, + }, + {"name": "seed_buyer_country_overwrite", "type": "dummy"}, { "name": "marts__analytics_engineering__model2_databricks", "catalog": "marts", @@ -317,7 +346,6 @@ "name": "int_model2", "follow_external_dependency": True, }, - {"name": "seed_buyer_country_overwrite", "type": "dummy"}, { "name": "stg_core_schema1__table1", "type": "dummy", diff --git a/tests/utilities/test_dbt_config_parser.py b/tests/utilities/test_dbt_config_parser.py index 9e4d18f..d401e4b 100644 --- a/tests/utilities/test_dbt_config_parser.py +++ b/tests/utilities/test_dbt_config_parser.py @@ -49,10 +49,6 @@ def test_generate_task_configs(self): def test_generate_dagger_tasks(self): test_inputs = [ - ( - "model.main.stg_core_schema1__table1", - EXPECTED_STAGING_NODE, - ), ( "seed.main.seed_buyer_country_overwrite", EXPECTED_SEED_NODE, @@ -159,7 +155,6 @@ def test_generate_io_inputs(self): ] for mock_input, expected_output in fixtures: result, _ = self._dbt_config_parser.generate_dagger_io(mock_input) - self.assertListEqual(result, 
expected_output) def test_generate_io_outputs(self): From 22ca09b45ab35072897ba2023420806b0f43f387 Mon Sep 17 00:00:00 2001 From: claudiazi Date: Tue, 18 Jun 2024 17:10:49 +0200 Subject: [PATCH 087/134] feat: adjust io for the materalised staging model --- dagger/utilities/dbt_config_parser.py | 16 ++--- .../dbt_config_parser_fixtures_athena.py | 65 +++++++++++++++---- .../dbt_config_parser_fixtures_databricks.py | 65 ++++++++++++++++--- 3 files changed, 115 insertions(+), 31 deletions(-) diff --git a/dagger/utilities/dbt_config_parser.py b/dagger/utilities/dbt_config_parser.py index bb79b39..8a62fb4 100644 --- a/dagger/utilities/dbt_config_parser.py +++ b/dagger/utilities/dbt_config_parser.py @@ -161,21 +161,19 @@ def _generate_dagger_tasks(self, node_name: str) -> List[Dict]: node, follow_external_dependency=True ) dagger_tasks.append(table_task) - elif node.get("config", {}).get("materialized") == "ephemeral" or ((node.get("name").startswith("stg_") or "preparation" in node.get( - "schema", "" - )) and node.get("config", {}).get("materialized") != "table"): + elif node.get("config", {}).get("materialized") == "ephemeral" or ( + ( + node.get("name").startswith("stg_") + or "preparation" in node.get("schema", "") + ) + and node.get("config", {}).get("materialized") != "table" + ): task = self._get_dummy_task(node, follow_external_dependency=True) dagger_tasks.append(task) ephemeral_parent_node_names = node.get("depends_on", {}).get("nodes", []) for node_name in ephemeral_parent_node_names: dagger_tasks += self._generate_dagger_tasks(node_name) - elif (node.get("name").startswith("stg_") or "preparation" in node.get( - "schema", "" - ) and node.get("config", {}).get("materialized") == "table"): - dagger_tasks.append( - self._get_dummy_task(node, follow_external_dependency=True) - ) else: table_task = self._get_table_task(node, follow_external_dependency=True) s3_task = self._get_s3_task(node) diff --git a/tests/fixtures/modules/dbt_config_parser_fixtures_athena.py 
b/tests/fixtures/modules/dbt_config_parser_fixtures_athena.py index 2f1c6ee..f1afd52 100644 --- a/tests/fixtures/modules/dbt_config_parser_fixtures_athena.py +++ b/tests/fixtures/modules/dbt_config_parser_fixtures_athena.py @@ -59,6 +59,7 @@ "name": "stg_core_schema1__table1", "config": { "materialized": "table", + "external_location": "s3://bucket1-data-lake/path2/stg_core_schema1__table1", }, "depends_on": { "macros": [], @@ -184,10 +185,18 @@ EXPECTED_STAGING_NODE = [ { - "name": "stg_core_schema1__table1", - "type": "dummy", + "name": "analytics_engineering__stg_core_schema1__table1_athena", + "type": "athena", + "table": "stg_core_schema1__table1", + "schema": "analytics_engineering", "follow_external_dependency": True, }, + { + "type": "s3", + "name": "s3_stg_core_schema1__table1", + "bucket": "bucket1-data-lake", + "path": "path2/stg_core_schema1__table1", + }, ] EXPECTED_SEED_NODE = [ @@ -213,10 +222,18 @@ "name": "seed_buyer_country_overwrite", }, { - "name": "stg_core_schema1__table1", - "type": "dummy", + "name": "analytics_engineering__stg_core_schema1__table1_athena", + "type": "athena", + "table": "stg_core_schema1__table1", + "schema": "analytics_engineering", "follow_external_dependency": True, }, + { + "type": "s3", + "name": "s3_stg_core_schema1__table1", + "bucket": "bucket1-data-lake", + "path": "path2/stg_core_schema1__table1", + }, { "type": "athena", "name": "analytics_engineering__model2_athena", @@ -267,10 +284,18 @@ "name": "seed_buyer_country_overwrite", }, { - "name": "stg_core_schema1__table1", - "type": "dummy", + "name": "analytics_engineering__stg_core_schema1__table1_athena", + "type": "athena", + "table": "stg_core_schema1__table1", + "schema": "analytics_engineering", "follow_external_dependency": True, - } + }, + { + "type": "s3", + "name": "s3_stg_core_schema1__table1", + "bucket": "bucket1-data-lake", + "path": "path2/stg_core_schema1__table1", + }, ] EXPECTED_MODEL_NODE = [ @@ -334,10 +359,18 @@ "follow_external_dependency": 
True, }, { - "name": "stg_core_schema1__table1", - "type": "dummy", + "name": "analytics_engineering__stg_core_schema1__table1_athena", + "type": "athena", + "table": "stg_core_schema1__table1", + "schema": "analytics_engineering", "follow_external_dependency": True, - } + }, + { + "type": "s3", + "name": "s3_stg_core_schema1__table1", + "bucket": "bucket1-data-lake", + "path": "path2/stg_core_schema1__table1", + }, ] EXPECTED_DBT_STAGING_MODEL_DAGGER_INPUTS = [ @@ -383,8 +416,16 @@ EXPECTED_DBT_INT_MODEL_DAGGER_INPUTS = [ {"name": "seed_buyer_country_overwrite", "type": "dummy"}, { - "name": "stg_core_schema1__table1", - "type": "dummy", + "name": "analytics_engineering__stg_core_schema1__table1_athena", + "type": "athena", + "table": "stg_core_schema1__table1", + "schema": "analytics_engineering", "follow_external_dependency": True, }, + { + "type": "s3", + "name": "s3_stg_core_schema1__table1", + "bucket": "bucket1-data-lake", + "path": "path2/stg_core_schema1__table1", + }, ] diff --git a/tests/fixtures/modules/dbt_config_parser_fixtures_databricks.py b/tests/fixtures/modules/dbt_config_parser_fixtures_databricks.py index 342c32a..2538e25 100644 --- a/tests/fixtures/modules/dbt_config_parser_fixtures_databricks.py +++ b/tests/fixtures/modules/dbt_config_parser_fixtures_databricks.py @@ -194,9 +194,18 @@ DATABRICKS_EXPECTED_STAGING_NODE = [ { - "name": "stg_core_schema1__table1", - "type": "dummy", + "type": "databricks", "follow_external_dependency": True, + "catalog": "hive_metastore", + "schema": "data_preparation", + "table": "stg_core_schema1__table1", + "name": "hive_metastore__data_preparation__stg_core_schema1__table1_databricks", + }, + { + "type": "s3", + "name": "s3_stg_core_schema1__table1", + "bucket": "chodata-data-lake", + "path": "analytics_warehouse/data/preparation/stg_core_schema1__table1", }, ] @@ -223,9 +232,18 @@ "name": "seed_buyer_country_overwrite", }, { - "name": "stg_core_schema1__table1", - "type": "dummy", + "type": "databricks", 
"follow_external_dependency": True, + "catalog": "hive_metastore", + "schema": "data_preparation", + "table": "stg_core_schema1__table1", + "name": "hive_metastore__data_preparation__stg_core_schema1__table1_databricks", + }, + { + "type": "s3", + "name": "s3_stg_core_schema1__table1", + "bucket": "chodata-data-lake", + "path": "analytics_warehouse/data/preparation/stg_core_schema1__table1", }, { "type": "databricks", @@ -278,9 +296,18 @@ "name": "seed_buyer_country_overwrite", }, { - "name": "stg_core_schema1__table1", - "type": "dummy", + "type": "databricks", "follow_external_dependency": True, + "catalog": "hive_metastore", + "schema": "data_preparation", + "table": "stg_core_schema1__table1", + "name": "hive_metastore__data_preparation__stg_core_schema1__table1_databricks", + }, + { + "type": "s3", + "name": "s3_stg_core_schema1__table1", + "bucket": "chodata-data-lake", + "path": "analytics_warehouse/data/preparation/stg_core_schema1__table1", }, ] @@ -347,9 +374,18 @@ "follow_external_dependency": True, }, { - "name": "stg_core_schema1__table1", - "type": "dummy", + "type": "databricks", "follow_external_dependency": True, + "catalog": "hive_metastore", + "schema": "data_preparation", + "table": "stg_core_schema1__table1", + "name": "hive_metastore__data_preparation__stg_core_schema1__table1_databricks", + }, + { + "type": "s3", + "name": "s3_stg_core_schema1__table1", + "bucket": "chodata-data-lake", + "path": "analytics_warehouse/data/preparation/stg_core_schema1__table1", }, ] @@ -374,9 +410,18 @@ DATABRICKS_EXPECTED_DBT_INT_MODEL_DAGGER_INPUTS = [ {"name": "seed_buyer_country_overwrite", "type": "dummy"}, { - "name": "stg_core_schema1__table1", - "type": "dummy", + "type": "databricks", "follow_external_dependency": True, + "catalog": "hive_metastore", + "schema": "data_preparation", + "table": "stg_core_schema1__table1", + "name": "hive_metastore__data_preparation__stg_core_schema1__table1_databricks", + }, + { + "type": "s3", + "name": 
"s3_stg_core_schema1__table1", + "bucket": "chodata-data-lake", + "path": "analytics_warehouse/data/preparation/stg_core_schema1__table1", }, ] From f2f8015d94bbda0681a6bbb07baafaf8078d8824 Mon Sep 17 00:00:00 2001 From: claudiazi Date: Wed, 19 Jun 2024 14:45:55 +0200 Subject: [PATCH 088/134] feat: restructure the dbt dagger task input & output --- dagger/utilities/dbt_config_parser.py | 99 +++++++++++-------- .../dbt_config_parser_fixtures_athena.py | 71 ++++++++----- .../dbt_config_parser_fixtures_databricks.py | 90 ++++++++++++----- tests/utilities/test_dbt_config_parser.py | 8 +- 4 files changed, 173 insertions(+), 95 deletions(-) diff --git a/dagger/utilities/dbt_config_parser.py b/dagger/utilities/dbt_config_parser.py index 8a62fb4..9a341f6 100644 --- a/dagger/utilities/dbt_config_parser.py +++ b/dagger/utilities/dbt_config_parser.py @@ -133,18 +133,25 @@ def _generate_dagger_output(self, node: dict): """Generate the dagger output for a DBT node. Must be implemented by subclasses.""" pass + @abstractmethod + def _is_node_preparation_model(self, node: dict): + """Define whether it is a preparation model. Must be implemented by subclasses.""" + pass + def _generate_dagger_tasks(self, node_name: str) -> List[Dict]: """ - Generates the dagger task based on whether the DBT model node is a staging model or not. - If the DBT model node represents a DBT seed or an ephemeral model, then a dagger dummy task is generated. - If the DBT model node represents a staging model, then a dagger athena task is generated for each source of the DBT model. Apart from this, a dummy task is also generated for the staging model itself. - If the DBT model node is not a staging model, then a dagger athena task and an s3 task is generated for the DBT model node itself. + Generates dagger tasks based on the type and materialization of the DBT model node. + + - If the node is a DBT source, an Athena table task is generated. 
+ - If the node is an ephemeral model, a dummy task is generated, and tasks for its dependent nodes are recursively generated. + - If the node is a staging model (preparation model) and not materialized as a table, a table task is generated along with tasks for its dependent nodes. + - For other nodes, a table task is generated. If the node is materialized as a table, an additional S3 task is also generated. + Args: node_name: The name of the DBT model node Returns: List[Dict]: The respective dagger tasks for the DBT model node - """ dagger_tasks = [] @@ -153,33 +160,36 @@ def _generate_dagger_tasks(self, node_name: str) -> List[Dict]: else: node = self._nodes_in_manifest[node_name] - if node.get("resource_type") == "seed": - task = self._get_dummy_task(node) - dagger_tasks.append(task) - elif node.get("resource_type") == "source": + resource_type = node.get("resource_type") + materialized_type = node.get("config", {}).get("materialized") + + follow_external_dependency = True + if resource_type == "seed" or (self._is_node_preparation_model(node) and materialized_type != "table"): + follow_external_dependency = False + + if resource_type == "source": table_task = self._get_athena_table_task( - node, follow_external_dependency=True + node, follow_external_dependency=follow_external_dependency ) dagger_tasks.append(table_task) - elif node.get("config", {}).get("materialized") == "ephemeral" or ( - ( - node.get("name").startswith("stg_") - or "preparation" in node.get("schema", "") - ) - and node.get("config", {}).get("materialized") != "table" - ): - task = self._get_dummy_task(node, follow_external_dependency=True) - dagger_tasks.append(task) - ephemeral_parent_node_names = node.get("depends_on", {}).get("nodes", []) - for node_name in ephemeral_parent_node_names: + elif materialized_type == "ephemeral": + task = self._get_dummy_task(node) + dagger_tasks.append(task) + for node_name in node.get("depends_on", {}).get("nodes", []): dagger_tasks += 
self._generate_dagger_tasks(node_name) - else: - table_task = self._get_table_task(node, follow_external_dependency=True) - s3_task = self._get_s3_task(node) + else: + table_task = self._get_table_task(node, follow_external_dependency=follow_external_dependency) dagger_tasks.append(table_task) - dagger_tasks.append(s3_task) + + if materialized_type in ("table", "incremental"): + dagger_tasks.append(self._get_s3_task(node)) + elif self._is_node_preparation_model(node): + for dependent_node_name in node.get("depends_on", {}).get("nodes", []): + dagger_tasks.extend( + self._generate_dagger_tasks(dependent_node_name) + ) return dagger_tasks @@ -223,6 +233,10 @@ def __init__(self, default_config_parameters: dict): "s3_data_dir" ) or self._target_config.get("s3_staging_dir") + def _is_node_preparation_model(self, node: dict): + """Define whether it is a preparation model.""" + return node.get("name").startswith("stg_") + def _get_table_task( self, node: dict, follow_external_dependency: bool = False ) -> dict: @@ -271,13 +285,14 @@ def _generate_dagger_output(self, node: dict): dict: The dagger output, which is a combination of an athena and s3 task for the DBT model node """ - if node.get("config", {}).get("materialized") in ( - "view", - "ephemeral", - ) or node.get("name").startswith("stg_"): + materialized_type = node.get("config", {}).get("materialized") + if materialized_type == "ephemeral": return [self._get_dummy_task(node)] else: - return [self._get_table_task(node), self._get_s3_task(node, is_output=True)] + output_tasks = [self._get_table_task(node)] + if materialized_type in ("table", "incremental"): + output_tasks.append(self._get_s3_task(node, is_output=True)) + return output_tasks class DatabricksDBTConfigParser(DBTConfigParser): @@ -291,6 +306,12 @@ def __init__(self, default_config_parameters: dict): "create_external_athena_table", False ) + def _is_node_preparation_model(self, node: dict): + """ + Define whether it is a preparation model. 
+ """ + return "preparation" in node.get("schema", "") + def _get_table_task( self, node: dict, follow_external_dependency: bool = False ) -> dict: @@ -351,21 +372,13 @@ def _generate_dagger_output(self, node: dict): dict: The dagger output, which is a combination of an athena and s3 task for the DBT model node """ - if ( - node.get("config", {}).get("materialized") - in ( - "view", - "ephemeral", - ) - or node.get("name").startswith("stg_") - or "preparation" in node.get("schema", "") - ): + materialized_type = node.get("config", {}).get("materialized") + if materialized_type == "ephemeral": return [self._get_dummy_task(node)] else: - output_tasks = [ - self._get_table_task(node), - self._get_s3_task(node, is_output=True), - ] + output_tasks = [self._get_table_task(node)] + if materialized_type in ("table", "incremental"): + output_tasks.append(self._get_s3_task(node, is_output=True)) if self._create_external_athena_table: output_tasks.append(self._get_athena_table_task(node)) return output_tasks diff --git a/tests/fixtures/modules/dbt_config_parser_fixtures_athena.py b/tests/fixtures/modules/dbt_config_parser_fixtures_athena.py index f1afd52..64fffce 100644 --- a/tests/fixtures/modules/dbt_config_parser_fixtures_athena.py +++ b/tests/fixtures/modules/dbt_config_parser_fixtures_athena.py @@ -201,8 +201,10 @@ EXPECTED_SEED_NODE = [ { - "type": "dummy", - "name": "seed_buyer_country_overwrite", + "type": "athena", + "schema": "analytics_engineering", + "table": "seed_buyer_country_overwrite", + "name": "analytics_engineering__seed_buyer_country_overwrite_athena", } ] @@ -210,16 +212,16 @@ { "type": "dummy", "name": "int_model3", - "follow_external_dependency": True, }, { "type": "dummy", "name": "int_model2", - "follow_external_dependency": True, }, { - "type": "dummy", - "name": "seed_buyer_country_overwrite", + "type": "athena", + "schema": "analytics_engineering", + "table": "seed_buyer_country_overwrite", + "name": 
"analytics_engineering__seed_buyer_country_overwrite_athena", }, { "name": "analytics_engineering__stg_core_schema1__table1_athena", @@ -248,9 +250,10 @@ "type": "s3", }, { - "name": "stg_core_schema2__table2", - "type": "dummy", - "follow_external_dependency": True, + "type": "athena", + "schema": "analytics_engineering", + "table": "stg_core_schema2__table2", + "name": "analytics_engineering__stg_core_schema2__table2_athena", }, { "type": "athena", @@ -272,16 +275,16 @@ { "type": "dummy", "name": "int_model3", - "follow_external_dependency": True, }, { "type": "dummy", "name": "int_model2", - "follow_external_dependency": True, }, { - "type": "dummy", - "name": "seed_buyer_country_overwrite", + "type": "athena", + "schema": "analytics_engineering", + "table": "seed_buyer_country_overwrite", + "name": "analytics_engineering__seed_buyer_country_overwrite_athena", }, { "name": "analytics_engineering__stg_core_schema1__table1_athena", @@ -316,9 +319,10 @@ EXPECTED_DAGGER_INPUTS = [ { - "name": "stg_core_schema2__table2", - "type": "dummy", - "follow_external_dependency": True, + "type": "athena", + "schema": "analytics_engineering", + "table": "stg_core_schema2__table2", + "name": "analytics_engineering__stg_core_schema2__table2_athena", }, { "type": "athena", @@ -334,7 +338,12 @@ "table": "table3", "follow_external_dependency": True, }, - {"name": "seed_buyer_country_overwrite", "type": "dummy"}, + { + "type": "athena", + "schema": "analytics_engineering", + "table": "seed_buyer_country_overwrite", + "name": "analytics_engineering__seed_buyer_country_overwrite_athena", + }, { "name": "analytics_engineering__model2_athena", "schema": "analytics_engineering", @@ -351,12 +360,10 @@ { "type": "dummy", "name": "int_model3", - "follow_external_dependency": True, }, { "type": "dummy", "name": "int_model2", - "follow_external_dependency": True, }, { "name": "analytics_engineering__stg_core_schema1__table1_athena", @@ -388,7 +395,12 @@ "table": "table3", "type": "athena", }, 
- {"name": "seed_buyer_country_overwrite", "type": "dummy"}, + { + "type": "athena", + "schema": "analytics_engineering", + "table": "seed_buyer_country_overwrite", + "name": "analytics_engineering__seed_buyer_country_overwrite_athena", + } ] EXPECTED_DAGGER_OUTPUTS = [ @@ -408,13 +420,26 @@ EXPECTED_DBT_STAGING_MODEL_DAGGER_OUTPUTS = [ { - "type": "dummy", - "name": "stg_core_schema2__table2", + "name": "analytics_engineering__stg_core_schema1__table1_athena", + "type": "athena", + "table": "stg_core_schema1__table1", + "schema": "analytics_engineering", + }, + { + "type": "s3", + "name": "output_s3_path", + "bucket": "bucket1-data-lake", + "path": "path2/stg_core_schema1__table1", }, ] EXPECTED_DBT_INT_MODEL_DAGGER_INPUTS = [ - {"name": "seed_buyer_country_overwrite", "type": "dummy"}, + { + "type": "athena", + "schema": "analytics_engineering", + "table": "seed_buyer_country_overwrite", + "name": "analytics_engineering__seed_buyer_country_overwrite_athena", + }, { "name": "analytics_engineering__stg_core_schema1__table1_athena", "type": "athena", diff --git a/tests/fixtures/modules/dbt_config_parser_fixtures_databricks.py b/tests/fixtures/modules/dbt_config_parser_fixtures_databricks.py index 2538e25..ad6b912 100644 --- a/tests/fixtures/modules/dbt_config_parser_fixtures_databricks.py +++ b/tests/fixtures/modules/dbt_config_parser_fixtures_databricks.py @@ -112,7 +112,7 @@ }, "seed.main.seed_buyer_country_overwrite": { "database": "hive_metastore", - "schema": "datastg_preparation", + "schema": "data_preparation", "name": "seed_buyer_country_overwrite", "unique_id": "seed.main.seed_buyer_country_overwrite", "resource_type": "seed", @@ -211,8 +211,11 @@ DATABRICKS_EXPECTED_SEED_NODE = [ { - "type": "dummy", - "name": "seed_buyer_country_overwrite", + "type": "databricks", + "catalog": "hive_metastore", + "schema": "data_preparation", + "table": "seed_buyer_country_overwrite", + "name": "hive_metastore__data_preparation__seed_buyer_country_overwrite_databricks", } 
] @@ -220,24 +223,25 @@ { "type": "dummy", "name": "int_model3", - "follow_external_dependency": True, }, { "type": "dummy", "name": "int_model2", - "follow_external_dependency": True, }, { - "type": "dummy", - "name": "seed_buyer_country_overwrite", + "type": "databricks", + "catalog": "hive_metastore", + "schema": "data_preparation", + "table": "seed_buyer_country_overwrite", + "name": "hive_metastore__data_preparation__seed_buyer_country_overwrite_databricks", }, { "type": "databricks", - "follow_external_dependency": True, "catalog": "hive_metastore", "schema": "data_preparation", "table": "stg_core_schema1__table1", "name": "hive_metastore__data_preparation__stg_core_schema1__table1_databricks", + "follow_external_dependency": True, }, { "type": "s3", @@ -260,9 +264,11 @@ "type": "s3", }, { - "name": "stg_core_schema2__table2", - "type": "dummy", - "follow_external_dependency": True, + "type": "databricks", + "catalog": "hive_metastore", + "schema": "data_preparation", + "table": "stg_core_schema2__table2", + "name": "hive_metastore__data_preparation__stg_core_schema2__table2_databricks", }, { "type": "athena", @@ -284,16 +290,17 @@ { "type": "dummy", "name": "int_model3", - "follow_external_dependency": True, }, { "type": "dummy", "name": "int_model2", - "follow_external_dependency": True, }, { - "type": "dummy", - "name": "seed_buyer_country_overwrite", + "type": "databricks", + "catalog": "hive_metastore", + "schema": "data_preparation", + "table": "seed_buyer_country_overwrite", + "name": "hive_metastore__data_preparation__seed_buyer_country_overwrite_databricks", }, { "type": "databricks", @@ -330,9 +337,11 @@ DATABRICKS_EXPECTED_DAGGER_INPUTS = [ { - "name": "stg_core_schema2__table2", - "type": "dummy", - "follow_external_dependency": True, + "type": "databricks", + "catalog": "hive_metastore", + "schema": "data_preparation", + "table": "stg_core_schema2__table2", + "name": "hive_metastore__data_preparation__stg_core_schema2__table2_databricks", }, { 
"type": "athena", @@ -348,7 +357,13 @@ "table": "table3", "follow_external_dependency": True, }, - {"name": "seed_buyer_country_overwrite", "type": "dummy"}, + { + "type": "databricks", + "catalog": "hive_metastore", + "schema": "data_preparation", + "table": "seed_buyer_country_overwrite", + "name": "hive_metastore__data_preparation__seed_buyer_country_overwrite_databricks", + }, { "name": "marts__analytics_engineering__model2_databricks", "catalog": "marts", @@ -366,12 +381,10 @@ { "type": "dummy", "name": "int_model3", - "follow_external_dependency": True, }, { "type": "dummy", "name": "int_model2", - "follow_external_dependency": True, }, { "type": "databricks", @@ -404,11 +417,23 @@ "table": "table3", "type": "athena", }, - {"name": "seed_buyer_country_overwrite", "type": "dummy"}, + { + "type": "databricks", + "catalog": "hive_metastore", + "schema": "data_preparation", + "table": "seed_buyer_country_overwrite", + "name": "hive_metastore__data_preparation__seed_buyer_country_overwrite_databricks", + }, ] DATABRICKS_EXPECTED_DBT_INT_MODEL_DAGGER_INPUTS = [ - {"name": "seed_buyer_country_overwrite", "type": "dummy"}, + { + "type": "databricks", + "catalog": "hive_metastore", + "schema": "data_preparation", + "table": "seed_buyer_country_overwrite", + "name": "hive_metastore__data_preparation__seed_buyer_country_overwrite_databricks", + }, { "type": "databricks", "follow_external_dependency": True, @@ -449,7 +474,22 @@ DATABRICKS_EXPECTED_DBT_STAGING_MODEL_DAGGER_OUTPUTS = [ { - "type": "dummy", - "name": "stg_core_schema2__table2", + "type": "databricks", + "catalog": "hive_metastore", + "schema": "data_preparation", + "table": "stg_core_schema1__table1", + "name": "hive_metastore__data_preparation__stg_core_schema1__table1_databricks", }, + { + "type": "s3", + "name": "output_s3_path", + "bucket": "chodata-data-lake", + "path": "analytics_warehouse/data/preparation/stg_core_schema1__table1", + }, + { + 'type': 'athena', + 'schema': 'data_preparation', + 
'table': 'stg_core_schema1__table1', + 'name': 'data_preparation__stg_core_schema1__table1_athena' + } ] diff --git a/tests/utilities/test_dbt_config_parser.py b/tests/utilities/test_dbt_config_parser.py index d401e4b..d4d9028 100644 --- a/tests/utilities/test_dbt_config_parser.py +++ b/tests/utilities/test_dbt_config_parser.py @@ -78,13 +78,14 @@ def test_generate_io_inputs(self): ] for mock_input, expected_output in fixtures: result, _ = self._dbt_config_parser.generate_dagger_io(mock_input) - + print(f"result: {result}") + print(f"expected_output: {expected_output}") self.assertListEqual(result, expected_output) def test_generate_io_outputs(self): fixtures = [ ("model1", EXPECTED_DAGGER_OUTPUTS), - ("stg_core_schema2__table2", EXPECTED_DBT_STAGING_MODEL_DAGGER_OUTPUTS), + ("stg_core_schema1__table1", EXPECTED_DBT_STAGING_MODEL_DAGGER_OUTPUTS), ] for mock_input, expected_output in fixtures: _, result = self._dbt_config_parser.generate_dagger_io(mock_input) @@ -161,11 +162,10 @@ def test_generate_io_outputs(self): fixtures = [ ("model1", DATABRICKS_EXPECTED_DAGGER_OUTPUTS), ( - "stg_core_schema2__table2", + "stg_core_schema1__table1", DATABRICKS_EXPECTED_DBT_STAGING_MODEL_DAGGER_OUTPUTS, ), ] for mock_input, expected_output in fixtures: _, result = self._dbt_config_parser.generate_dagger_io(mock_input) - self.assertListEqual(result, expected_output) From 698358dfcd9231f6097d44736c738ab4279e9b29 Mon Sep 17 00:00:00 2001 From: David Siklosi Date: Thu, 4 Jul 2024 14:23:36 +0200 Subject: [PATCH 089/134] Module generation with generalised jinja parameters --- dagger/cli/module.py | 21 +++++++++++++++++++-- dagger/utilities/module.py | 4 +++- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/dagger/cli/module.py b/dagger/cli/module.py index 67fca87..931e809 100644 --- a/dagger/cli/module.py +++ b/dagger/cli/module.py @@ -1,17 +1,34 @@ import click from dagger.utilities.module import Module from dagger.utils import Printer +import json +def 
parse_key_value(ctx, param, value): + #print('YYY', value) + if not value: + return {} + key_value_dict = {} + for pair in value: + try: + key, val_file_path = pair.split('=', 1) + #print('YYY', key, val_file_path, pair) + val = json.load(open(val_file_path)) + key_value_dict[key] = val + except ValueError: + raise click.BadParameter(f"Key-value pair '{pair}' is not in the format key=value") + return key_value_dict + @click.command() @click.option("--config_file", "-c", help="Path to module config file") @click.option("--target_dir", "-t", help="Path to directory to generate the task configs to") -def generate_tasks(config_file: str, target_dir: str) -> None: +@click.option("--jinja_parameters", "-j", callback=parse_key_value, multiple=True, default=None, help="Path to jinja parameters json file in the format: =") +def generate_tasks(config_file: str, target_dir: str, jinja_parameters: dict) -> None: """ Generating tasks for a module based on config """ - module = Module(config_file, target_dir) + module = Module(config_file, target_dir, jinja_parameters) module.generate_task_configs() Printer.print_success("Tasks are successfully generated") diff --git a/dagger/utilities/module.py b/dagger/utilities/module.py index ff1329f..242e2f5 100644 --- a/dagger/utilities/module.py +++ b/dagger/utilities/module.py @@ -14,7 +14,7 @@ class Module: - def __init__(self, path_to_config, target_dir): + def __init__(self, path_to_config, target_dir, jinja_parameters): self._directory = path.dirname(path_to_config) self._target_dir = target_dir or "./" self._path_to_config = path_to_config @@ -29,6 +29,7 @@ def __init__(self, path_to_config, target_dir): self._branches_to_generate = config["branches_to_generate"] self._override_parameters = config.get("override_parameters", {}) self._default_parameters = config.get("default_parameters", {}) + self._jinja_parameters = jinja_parameters @staticmethod def read_yaml(yaml_str): @@ -76,6 +77,7 @@ def generate_task_configs(self): 
template_parameters.update(self._default_parameters or {}) template_parameters.update(attrs) template_parameters['branch_name'] = branch_name + template_parameters.update(self._jinja_parameters) dbt_manifest = None if "dbt" in self._tasks.keys(): From cd12ddabc2a6991a54cab61b274335b5b67a63af Mon Sep 17 00:00:00 2001 From: David Siklosi Date: Thu, 4 Jul 2024 16:19:44 +0200 Subject: [PATCH 090/134] Now it's possible to assign task to task groups --- dagger/dag_creator/airflow/operator_creator.py | 13 +++++++++++++ dagger/pipeline/task.py | 11 +++++++++++ 2 files changed, 24 insertions(+) diff --git a/dagger/dag_creator/airflow/operator_creator.py b/dagger/dag_creator/airflow/operator_creator.py index fc46234..b6aa036 100644 --- a/dagger/dag_creator/airflow/operator_creator.py +++ b/dagger/dag_creator/airflow/operator_creator.py @@ -1,5 +1,6 @@ from abc import ABC, abstractmethod from datetime import timedelta +from airflow.utils.task_group import TaskGroup TIMEDELTA_PARAMETERS = ['execution_timeout'] @@ -11,6 +12,15 @@ def __init__(self, task, dag): self._template_parameters = {} self._airflow_parameters = {} + def _get_existing_task_group_or_create_new(self): + group_id = self._task.task_group + if self._dag.task_group: + for group in self._dag.task_group.children.values(): + if isinstance(group, TaskGroup) and group.group_id == group_id: + return group + + return TaskGroup(group_id=group_id, dag=self._dag) + @abstractmethod def _create_operator(self, kwargs): raise NotImplementedError @@ -34,6 +44,9 @@ def _update_airflow_parameters(self): if self._task.timeout_in_seconds: self._airflow_parameters["execution_timeout"] = self._task.timeout_in_seconds + if self._task.task_group: + self._airflow_parameters["task_group"] = self._get_existing_task_group_or_create_new() + self._fix_timedelta_parameters() def create_operator(self): diff --git a/dagger/pipeline/task.py b/dagger/pipeline/task.py index 26235bd..ce07aec 100644 --- a/dagger/pipeline/task.py +++ 
b/dagger/pipeline/task.py @@ -36,6 +36,12 @@ def init_attributes(cls, orig_cls): comment="Use dagger init-io cli", ), Attribute(attribute_name="pool", required=False), + Attribute( + attribute_name="task_group", + required=False, + format_help=str, + comment="Task group name", + ), Attribute( attribute_name="timeout_in_seconds", required=False, @@ -73,6 +79,7 @@ def __init__(self, name: str, pipeline_name, pipeline, config: dict): self._outputs = [] self._pool = self.parse_attribute("pool") or self.default_pool self._timeout_in_seconds = self.parse_attribute("timeout_in_seconds") + self._task_group = self.parse_attribute("task_group") self.process_inputs(config["inputs"]) self.process_outputs(config["outputs"]) @@ -137,6 +144,10 @@ def pool(self): def timeout_in_seconds(self): return self._timeout_in_seconds + @property + def task_group(self): + return self._task_group + def add_input(self, task_input: IO): _logger.info("Adding input: %s to task: %s", task_input.name, self._name) self._inputs.append(task_input) From 0e6a6a638838ea14ff1607ee26a0ef9cb6778fbe Mon Sep 17 00:00:00 2001 From: David Siklosi Date: Thu, 4 Jul 2024 17:54:32 +0200 Subject: [PATCH 091/134] Adding default value to the parameter --- dagger/utilities/module.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dagger/utilities/module.py b/dagger/utilities/module.py index 242e2f5..8697efa 100644 --- a/dagger/utilities/module.py +++ b/dagger/utilities/module.py @@ -14,7 +14,7 @@ class Module: - def __init__(self, path_to_config, target_dir, jinja_parameters): + def __init__(self, path_to_config, target_dir, jinja_parameters=None): self._directory = path.dirname(path_to_config) self._target_dir = target_dir or "./" self._path_to_config = path_to_config @@ -29,7 +29,7 @@ def __init__(self, path_to_config, target_dir, jinja_parameters): self._branches_to_generate = config["branches_to_generate"] self._override_parameters = config.get("override_parameters", {}) 
self._default_parameters = config.get("default_parameters", {}) - self._jinja_parameters = jinja_parameters + self._jinja_parameters = jinja_parameters or {} @staticmethod def read_yaml(yaml_str): From 8d0e23feaa6f324901440d76b68574904c039c81 Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Wed, 6 Nov 2024 13:32:02 +0100 Subject: [PATCH 092/134] removed custom dbt task generation logic from Module --- dagger/utilities/module.py | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/dagger/utilities/module.py b/dagger/utilities/module.py index 8697efa..4172fce 100644 --- a/dagger/utilities/module.py +++ b/dagger/utilities/module.py @@ -79,20 +79,6 @@ def generate_task_configs(self): template_parameters['branch_name'] = branch_name template_parameters.update(self._jinja_parameters) - dbt_manifest = None - if "dbt" in self._tasks.keys(): - if template_parameters.get("profile_name") == "athena": - self._dbt_module = AthenaDBTConfigParser(template_parameters) - if template_parameters.get("profile_name") == "databricks": - self._dbt_module = DatabricksDBTConfigParser(template_parameters) - - dbt_manifest = {} - dbt_manifest['nodes'] = self._dbt_module.nodes_in_manifest - dbt_manifest['sources'] = self._dbt_module.sources_in_manifest - - template_parameters["dbt_manifest"] = dbt_manifest - template_parameters["dbt_default_schema"] = self._dbt_module.dbt_default_schema - for task, task_yaml in self._tasks.items(): task_name = f"{branch_name}_{task}" _logger.info(f"Generating task {task_name}") @@ -101,12 +87,6 @@ def generate_task_configs(self): ) task_dict = yaml.safe_load(task_str) - if task == "dbt": - inputs, outputs = self._dbt_module.generate_dagger_io(branch_name) - task_dict["inputs"] = inputs - task_dict["outputs"] = outputs - task_dict["task_parameters"]["select"] = branch_name - task_dict["autogenerated_by_dagger"] = self._path_to_config override_parameters = self._override_parameters or {} merge(task_dict, override_parameters.get(branch_name, 
{}).get(task, {})) From 77f6026a030ef1ee1821aa9feec0d8a1e19f1b25 Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Thu, 7 Nov 2024 10:23:46 +0100 Subject: [PATCH 093/134] add plugins path to dagger config --- dagger/conf.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/dagger/conf.py b/dagger/conf.py index 6b5488f..5322ce3 100644 --- a/dagger/conf.py +++ b/dagger/conf.py @@ -98,4 +98,8 @@ # Alert parameters alert_config = config.get('alert', None) or {} SLACK_TOKEN = alert_config.get('slack_token', None) -DEFAULT_ALERT = alert_config.get('default_alert', {"type": "slack", "channel": "#airflow-jobs", "mentions": None}) \ No newline at end of file +DEFAULT_ALERT = alert_config.get('default_alert', {"type": "slack", "channel": "#airflow-jobs", "mentions": None}) + +# Plugin parameters +plugin_config = config.get('plugin', None) or {} +PLUGIN_DIRS = [os.path.join(AIRFLOW_HOME, path) for path in plugin_config.get('paths', [])] From fbb648b034974c6b4cb9683aea80667fa1f60918 Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Thu, 7 Nov 2024 10:26:00 +0100 Subject: [PATCH 094/134] added function to load plugins --- dagger/utilities/module.py | 36 ++++++++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/dagger/utilities/module.py b/dagger/utilities/module.py index 4172fce..7aa38ba 100644 --- a/dagger/utilities/module.py +++ b/dagger/utilities/module.py @@ -1,15 +1,16 @@ +import importlib +import inspect import logging +import os +import pkgutil from os import path -from mergedeep import merge -from dagger.utilities.dbt_config_parser import ( - AthenaDBTConfigParser, - DatabricksDBTConfigParser, -) import jinja2 - import yaml +from dagger import conf +from mergedeep import merge + _logger = logging.getLogger("root") @@ -49,6 +50,29 @@ def read_task_config(self, task): exit(1) return content + @staticmethod + def load_plugins() -> dict: + """ + Dynamically load all classes(plugins) from the folders defined in 
the conf.PLUGIN_DIRS variable. + The folder contains all plugins that are part of the project. + Returns: + dict: A dictionary with the class name as key and the class object as value + """ + classes = {} + + for module_info in pkgutil.iter_modules(conf.PLUGIN_DIRS): + module_name = module_info.name + module_path = os.path.join(module_info.module_finder.path, f"{module_name}.py") + spec = importlib.util.spec_from_file_location(module_name, module_path) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + + for name, obj in inspect.getmembers(module, inspect.isclass): + classes[f"{name}"] = obj + + return classes + + @staticmethod def replace_template_parameters(_task_str, _template_parameters): environment = jinja2.Environment() From 90bb16bfc0777bf7f0ca6fea2879298bba0d8bf3 Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Thu, 7 Nov 2024 10:26:12 +0100 Subject: [PATCH 095/134] load plugins and render jinja --- dagger/utilities/module.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/dagger/utilities/module.py b/dagger/utilities/module.py index 7aa38ba..41da171 100644 --- a/dagger/utilities/module.py +++ b/dagger/utilities/module.py @@ -76,6 +76,10 @@ def load_plugins() -> dict: @staticmethod def replace_template_parameters(_task_str, _template_parameters): environment = jinja2.Environment() + loaded_classes = Module.load_plugins() + for class_name, class_obj in loaded_classes.items(): + environment.globals[class_name] = class_obj + template = environment.from_string(_task_str) rendered_task = template.render(_template_parameters) From 1d09c57ef9e5a570e24052f0f4fe8c4f106c7262 Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Thu, 7 Nov 2024 11:49:48 +0100 Subject: [PATCH 096/134] iterate over multiple folders and their subfolders --- dagger/utilities/module.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/dagger/utilities/module.py b/dagger/utilities/module.py index 41da171..4a85d37 
100644 --- a/dagger/utilities/module.py +++ b/dagger/utilities/module.py @@ -60,15 +60,18 @@ def load_plugins() -> dict: """ classes = {} - for module_info in pkgutil.iter_modules(conf.PLUGIN_DIRS): - module_name = module_info.name - module_path = os.path.join(module_info.module_finder.path, f"{module_name}.py") - spec = importlib.util.spec_from_file_location(module_name, module_path) - module = importlib.util.module_from_spec(spec) - spec.loader.exec_module(module) - - for name, obj in inspect.getmembers(module, inspect.isclass): - classes[f"{name}"] = obj + for plugin_path in conf.PLUGIN_DIRS: + for root, dirs, files in os.walk(plugin_path): + for plugin_file in files: + if plugin_file.endswith(".py") and not plugin_file.startswith("__init__"): + module_name = plugin_file.replace(".py", "") + module_path = os.path.join(root, plugin_file) + spec = importlib.util.spec_from_file_location(module_name, module_path) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + + for name, obj in inspect.getmembers(module, inspect.isclass): + classes[f"{name}"] = obj return classes From da0528ada6d08159470ab67289ff9ce5bb235d51 Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Thu, 7 Nov 2024 12:50:55 +0100 Subject: [PATCH 097/134] exclude all files starting with __ --- dagger/utilities/module.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dagger/utilities/module.py b/dagger/utilities/module.py index 4a85d37..6bf2c35 100644 --- a/dagger/utilities/module.py +++ b/dagger/utilities/module.py @@ -63,7 +63,7 @@ def load_plugins() -> dict: for plugin_path in conf.PLUGIN_DIRS: for root, dirs, files in os.walk(plugin_path): for plugin_file in files: - if plugin_file.endswith(".py") and not plugin_file.startswith("__init__"): + if plugin_file.endswith(".py") and not plugin_file.startswith("__"): module_name = plugin_file.replace(".py", "") module_path = os.path.join(root, plugin_file) spec = 
importlib.util.spec_from_file_location(module_name, module_path) From a8aeff7612e6c67453e163bb175288bb400fb31f Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Thu, 7 Nov 2024 12:51:03 +0100 Subject: [PATCH 098/134] added plugin to dagger config --- dagger/dagger_config.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/dagger/dagger_config.yaml b/dagger/dagger_config.yaml index 3366828..69c3d54 100644 --- a/dagger/dagger_config.yaml +++ b/dagger/dagger_config.yaml @@ -58,3 +58,7 @@ alert: # type: slack # channel: "#airflow-jobs" # mentions: + +plugin: +# paths: +# - plugins From 6addf9857cd8f8cc6a50a71532b9f0aa201f52f1 Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Thu, 7 Nov 2024 12:52:05 +0100 Subject: [PATCH 099/134] added logging for plugins --- dagger/conf.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dagger/conf.py b/dagger/conf.py index 5322ce3..5036da3 100644 --- a/dagger/conf.py +++ b/dagger/conf.py @@ -101,5 +101,6 @@ DEFAULT_ALERT = alert_config.get('default_alert', {"type": "slack", "channel": "#airflow-jobs", "mentions": None}) # Plugin parameters -plugin_config = config.get('plugin', None) or {} +plugin_config = config.get('plugin', {}) PLUGIN_DIRS = [os.path.join(AIRFLOW_HOME, path) for path in plugin_config.get('paths', [])] +logging.info(f"All Python classes will be loaded as plugins from the following directories: {PLUGIN_DIRS}") From b8feafb4506ee35dc176dd0c2d530b35a236c69e Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Thu, 7 Nov 2024 15:44:58 +0100 Subject: [PATCH 100/134] fix --- dagger/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dagger/conf.py b/dagger/conf.py index 5036da3..cbb075d 100644 --- a/dagger/conf.py +++ b/dagger/conf.py @@ -101,6 +101,6 @@ DEFAULT_ALERT = alert_config.get('default_alert', {"type": "slack", "channel": "#airflow-jobs", "mentions": None}) # Plugin parameters -plugin_config = config.get('plugin', {}) +plugin_config = config.get('plugin', None) or 
{} PLUGIN_DIRS = [os.path.join(AIRFLOW_HOME, path) for path in plugin_config.get('paths', [])] logging.info(f"All Python classes will be loaded as plugins from the following directories: {PLUGIN_DIRS}") From d8c6384257d3efd8f464c72fee9692621c01dd79 Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Thu, 7 Nov 2024 15:45:45 +0100 Subject: [PATCH 101/134] added tests for plugins --- .../sample_folder/sample_folder_plugin.py | 5 ++ tests/fixtures/plugins/sample_plugin.py | 0 tests/utilities/test_plugins.py | 60 +++++++++++++++++++ 3 files changed, 65 insertions(+) create mode 100644 tests/fixtures/plugins/sample_folder/sample_folder_plugin.py create mode 100644 tests/fixtures/plugins/sample_plugin.py create mode 100644 tests/utilities/test_plugins.py diff --git a/tests/fixtures/plugins/sample_folder/sample_folder_plugin.py b/tests/fixtures/plugins/sample_folder/sample_folder_plugin.py new file mode 100644 index 0000000..c8b931e --- /dev/null +++ b/tests/fixtures/plugins/sample_folder/sample_folder_plugin.py @@ -0,0 +1,5 @@ +class SampleFolderPlugin: + @staticmethod + def get_inputs(): + return [{"name": "sample_folder_plugin_task", "type": "dummy"}] + diff --git a/tests/fixtures/plugins/sample_plugin.py b/tests/fixtures/plugins/sample_plugin.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/utilities/test_plugins.py b/tests/utilities/test_plugins.py new file mode 100644 index 0000000..1591317 --- /dev/null +++ b/tests/utilities/test_plugins.py @@ -0,0 +1,60 @@ +import inspect +import shutil +import unittest +from pathlib import Path +from unittest.mock import patch +import os +import importlib.util + +import jinja2 + +from dagger.utilities.module import Module # Adjust this import according to your actual module structure +from dagger import conf + +TESTS_ROOT = Path(__file__).parent.parent + +class TestLoadPlugins(unittest.TestCase): + + def setUp(self): + self._jinja_environment = jinja2.Environment() + loaded_classes = Module.load_plugins() + for 
class_name, class_obj in loaded_classes.items(): + self._jinja_environment.globals[class_name] = class_obj + + self._template = self._jinja_environment.from_string("inputs: {{ SampleFolderPlugin.get_inputs() }}") + + @patch("dagger.conf.PLUGIN_DIRS", new=[]) + @patch("os.walk") + def test_load_plugins_no_plugin_dir(self, mock_os_walk): + # Simulate os.walk returning no Python files + mock_os_walk.return_value = [("/fake/plugin/dir", [], [])] + + result = Module.load_plugins() + + # Expecting an empty dictionary since no plugins were found + self.assertEqual(result, {}) + + @patch("dagger.conf.PLUGIN_DIRS", new=[str(TESTS_ROOT.joinpath("fixtures/plugins"))]) + def test_load_plugins(self): + + result = Module.load_plugins() + for name, plugin_class in result.items(): + result[name] = str(plugin_class) + + expected_classes = {"SampleFolderPlugin": ""} + + self.assertEqual(result, expected_classes) + + @patch("dagger.conf.PLUGIN_DIRS", new=[str(TESTS_ROOT.joinpath("fixtures/plugins"))]) + def test_load_plugins_in_jinja(self): + result = Module.load_plugins() + for class_name, class_obj in result.items(): + self._jinja_environment.globals[class_name] = class_obj + + rendered_task = self._template.render() + expected_task = "inputs: [{'name': 'sample_folder_plugin_task', 'type': 'dummy'}]" + + self.assertEqual(rendered_task, expected_task) + +if __name__ == '__main__': + unittest.main() \ No newline at end of file From a3d7f47b780f7d7a5ddcb021028b7976692dc292 Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Thu, 7 Nov 2024 17:49:17 +0100 Subject: [PATCH 102/134] refactor code * add the plugins into jinja env directly --- dagger/utilities/module.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/dagger/utilities/module.py b/dagger/utilities/module.py index 6bf2c35..169123e 100644 --- a/dagger/utilities/module.py +++ b/dagger/utilities/module.py @@ -3,7 +3,7 @@ import logging import os import pkgutil -from os import path +from os import 
path, environ import jinja2 import yaml @@ -51,7 +51,7 @@ def read_task_config(self, task): return content @staticmethod - def load_plugins() -> dict: + def load_plugins_to_jinja_environment(environment: jinja2.Environment) -> jinja2.Environment: """ Dynamically load all classes(plugins) from the folders defined in the conf.PLUGIN_DIRS variable. The folder contains all plugins that are part of the project. @@ -71,18 +71,14 @@ def load_plugins() -> dict: spec.loader.exec_module(module) for name, obj in inspect.getmembers(module, inspect.isclass): - classes[f"{name}"] = obj - - return classes + environment.globals[f"{name}"] = obj + return environment @staticmethod def replace_template_parameters(_task_str, _template_parameters): environment = jinja2.Environment() - loaded_classes = Module.load_plugins() - for class_name, class_obj in loaded_classes.items(): - environment.globals[class_name] = class_obj - + environment = Module.load_plugins_to_jinja_environment(environment) template = environment.from_string(_task_str) rendered_task = template.render(_template_parameters) From 450b54491fd9d8f7d827cd72f8aabd8073c9229b Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Thu, 7 Nov 2024 17:55:34 +0100 Subject: [PATCH 103/134] refactor tests --- tests/utilities/test_plugins.py | 26 +++++++------------------- 1 file changed, 7 insertions(+), 19 deletions(-) diff --git a/tests/utilities/test_plugins.py b/tests/utilities/test_plugins.py index 1591317..6a4b343 100644 --- a/tests/utilities/test_plugins.py +++ b/tests/utilities/test_plugins.py @@ -8,7 +8,7 @@ import jinja2 -from dagger.utilities.module import Module # Adjust this import according to your actual module structure +from dagger.utilities.module import Module from dagger import conf TESTS_ROOT = Path(__file__).parent.parent @@ -17,10 +17,6 @@ class TestLoadPlugins(unittest.TestCase): def setUp(self): self._jinja_environment = jinja2.Environment() - loaded_classes = Module.load_plugins() - for class_name, class_obj 
in loaded_classes.items(): - self._jinja_environment.globals[class_name] = class_obj - self._template = self._jinja_environment.from_string("inputs: {{ SampleFolderPlugin.get_inputs() }}") @patch("dagger.conf.PLUGIN_DIRS", new=[]) @@ -29,27 +25,19 @@ def test_load_plugins_no_plugin_dir(self, mock_os_walk): # Simulate os.walk returning no Python files mock_os_walk.return_value = [("/fake/plugin/dir", [], [])] - result = Module.load_plugins() + result_environment = Module.load_plugins_to_jinja_environment(self._jinja_environment) - # Expecting an empty dictionary since no plugins were found - self.assertEqual(result, {}) + self.assertNotIn("SampleFolderPlugin", result_environment.globals) @patch("dagger.conf.PLUGIN_DIRS", new=[str(TESTS_ROOT.joinpath("fixtures/plugins"))]) def test_load_plugins(self): + result_environment = Module.load_plugins_to_jinja_environment(self._jinja_environment) - result = Module.load_plugins() - for name, plugin_class in result.items(): - result[name] = str(plugin_class) - - expected_classes = {"SampleFolderPlugin": ""} - - self.assertEqual(result, expected_classes) + self.assertIn("SampleFolderPlugin", result_environment.globals.keys()) @patch("dagger.conf.PLUGIN_DIRS", new=[str(TESTS_ROOT.joinpath("fixtures/plugins"))]) - def test_load_plugins_in_jinja(self): - result = Module.load_plugins() - for class_name, class_obj in result.items(): - self._jinja_environment.globals[class_name] = class_obj + def test_load_plugins_render_jinja(self): + result_environment = Module.load_plugins_to_jinja_environment(self._jinja_environment) rendered_task = self._template.render() expected_task = "inputs: [{'name': 'sample_folder_plugin_task', 'type': 'dummy'}]" From 98a6b55db23a92379289e362c95be54660eefb02 Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Fri, 8 Nov 2024 13:32:39 +0100 Subject: [PATCH 104/134] exclude test folders from directory walk --- dagger/utilities/module.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git 
a/dagger/utilities/module.py b/dagger/utilities/module.py index 169123e..7f33690 100644 --- a/dagger/utilities/module.py +++ b/dagger/utilities/module.py @@ -58,12 +58,11 @@ def load_plugins_to_jinja_environment(environment: jinja2.Environment) -> jinja2 Returns: dict: A dictionary with the class name as key and the class object as value """ - classes = {} - for plugin_path in conf.PLUGIN_DIRS: for root, dirs, files in os.walk(plugin_path): + dirs[:] = [directory for directory in dirs if not directory.lower().startswith("test")] for plugin_file in files: - if plugin_file.endswith(".py") and not plugin_file.startswith("__"): + if plugin_file.endswith(".py") and not (plugin_file.startswith("__") or plugin_file.startswith("test")): module_name = plugin_file.replace(".py", "") module_path = os.path.join(root, plugin_file) spec = importlib.util.spec_from_file_location(module_name, module_path) From ab6ff156f784b137f115187a736ce70aba4af39e Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Fri, 8 Nov 2024 13:44:12 +0100 Subject: [PATCH 105/134] remove unused imports --- tests/utilities/test_plugins.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tests/utilities/test_plugins.py b/tests/utilities/test_plugins.py index 6a4b343..353a407 100644 --- a/tests/utilities/test_plugins.py +++ b/tests/utilities/test_plugins.py @@ -1,15 +1,10 @@ -import inspect -import shutil import unittest from pathlib import Path from unittest.mock import patch -import os -import importlib.util import jinja2 from dagger.utilities.module import Module -from dagger import conf TESTS_ROOT = Path(__file__).parent.parent From 0fa5389968dd8342f985132ddae8984248833f41 Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Mon, 11 Nov 2024 15:56:35 +0100 Subject: [PATCH 106/134] update readme --- README.md | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/README.md b/README.md index 2a161be..d70a203 100644 --- a/README.md +++ b/README.md @@ -87,6 +87,40 @@ flowchart 
TD; ``` +Plugins for dagger +------- + +### Overview +Dagger now supports a plugin system that allows users to extend its functionality by adding custom Python classes. These plugins are integrated into the Jinja2 templating engine, enabling dynamic rendering of task configuration templates. +### Purpose +The plugin system allows users to define Python classes that can be loaded into the Jinja2 environment. When functions from these classes are invoked within a task configuration template, they are rendered dynamically using Jinja2. This feature enhances the flexibility of task configurations by allowing custom logic to be embedded directly in the templates. + +### Usage +1. **Creating a Plugin:** To create a new plugin, define a Python class in a folder (for example `plugins/sample_plugin/sample_plugin.py`) with the desired methods. For example: +```python +class MyCustomPlugin: + def generate_input(self, branch_name): + return [{"name": f"{branch_name}", "type": "dummy"}] +``` +This class defines a `generate_input` method that takes the branch_name from the module config and returns a dummy dagger task. +2. **Loading the Plugin into Dagger:** To load this plugin into Dagger's Jinja2 environment, you need to register it in your `dagger_config.yaml`: +```yaml +# dagger_config.yaml +plugin: + paths: + - plugins # all Python classes within this path will be loaded into the Jinja environment +``` + +3. 
**Using Plugin Methods in Templates:** Once the plugin is loaded, you can call its methods from within any Jinja2 template in your task configurations: +```yaml +# task_configuration.yaml +type: batch +description: sample task +inputs: # format: list | Use dagger init-io cli + {{ MyCustomPlugin.generate_input("dummy_input") }} +``` + + Credits ------- From e7f08cd0fd841b62d755878325c8934bf3fb2317 Mon Sep 17 00:00:00 2001 From: Kiran Vasudev Date: Tue, 12 Nov 2024 15:43:21 +0100 Subject: [PATCH 107/134] fix readme --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index d70a203..8e59257 100644 --- a/README.md +++ b/README.md @@ -103,6 +103,7 @@ class MyCustomPlugin: return [{"name": f"{branch_name}", "type": "dummy"}] ``` This class defines a `generate_input` method that takes the branch_name from the module config and returns a dummy dagger task. + 2. **Loading the Plugin into Dagger:** To load this plugin into Dagger's Jinja2 environment, you need to register it in your `dagger_config.yaml`: ```yaml # pipeline.yaml From 7d871b6ce7deb6f016e5e5299510325d17db9142 Mon Sep 17 00:00:00 2001 From: David Siklosi Date: Thu, 2 Jan 2025 13:15:13 +0100 Subject: [PATCH 108/134] Adding new reverse etl operator to dagger inherited from batch operator --- .../operator_creators/reverse_etl_creator.py | 55 +++++ .../airflow/operators/reverse_etl_batch.py | 8 + dagger/pipeline/tasks/reverse_etl_task.py | 203 ++++++++++++++++++ 3 files changed, 266 insertions(+) create mode 100644 dagger/dag_creator/airflow/operator_creators/reverse_etl_creator.py create mode 100644 dagger/dag_creator/airflow/operators/reverse_etl_batch.py create mode 100644 dagger/pipeline/tasks/reverse_etl_task.py diff --git a/dagger/dag_creator/airflow/operator_creators/reverse_etl_creator.py b/dagger/dag_creator/airflow/operator_creators/reverse_etl_creator.py new file mode 100644 index 0000000..f6f9095 --- /dev/null +++ 
b/dagger/dag_creator/airflow/operator_creators/reverse_etl_creator.py @@ -0,0 +1,55 @@ +import base64 + +from dagger.dag_creator.airflow.operator_creators.batch_creator import BatchCreator +import json + + +class ReverseEtlCreator(BatchCreator): + ref_name = "reverse_etl" + + def __init__(self, task, dag): + super().__init__(task, dag) + + self._assume_role_arn = task.assume_role_arn + self._num_threads = task.num_threads + self._batch_size = task.batch_size + self._absolute_job_name = task.absolute_job_name + self._primary_id_column = task.primary_id_column + self._secondary_id_column = task.secondary_id_column + self._custom_id_column = task.custom_id_column + self._model_name = task.model_name + self._project_name = task.project_name + self._is_deleted_column = task.is_deleted_column + self._hash_column = task.hash_column + self._updated_at_column = task.updated_at_column + self._from_time = task.from_time + self._days_to_live = task.days_to_live + + def _generate_command(self): + command = [self._task.executable_prefix, self._task.executable] + + + command.append(f"--num_threads={self._num_threads}") + command.append(f"--batch_size={self._batch_size}") + command.append(f"--primary_id_column={self._primary_id_column}") + command.append(f"--model_name={self._model_name}") + command.append(f"--project_name={self._project_name}") + + if self._assume_role_arn: + command.append(f"--assume_role_arn={self._assume_role_arn}") + if self._secondary_id_column: + command.append(f"--secondary_id_column={self._secondary_id_column}") + if self._custom_id_column: + command.append(f"--custom_id_column={self._custom_id_column}") + if self._is_deleted_column: + command.append(f"--is_deleted_column={self._is_deleted_column}") + if self._hash_column: + command.append(f"--hash_column={self._hash_column}") + if self._updated_at_column: + command.append(f"--updated_at_column={self._updated_at_column}") + if self._from_time: + command.append(f"--from_time={self._from_time}") + if 
self._days_to_live: + command.append(f"--days_to_live={self._days_to_live}") + + return command diff --git a/dagger/dag_creator/airflow/operators/reverse_etl_batch.py b/dagger/dag_creator/airflow/operators/reverse_etl_batch.py new file mode 100644 index 0000000..abb775b --- /dev/null +++ b/dagger/dag_creator/airflow/operators/reverse_etl_batch.py @@ -0,0 +1,8 @@ +from dagger.dag_creator.airflow.operators.awsbatch_operator import AWSBatchOperator + +class ReverseEtlBatchOperator(AWSBatchOperator): + custom_operator_name = 'ReverseETL' + ui_color = "#f0ede4" + + def __init__(self, *args, **kwargs): + super().__init__(args, kwargs) diff --git a/dagger/pipeline/tasks/reverse_etl_task.py b/dagger/pipeline/tasks/reverse_etl_task.py new file mode 100644 index 0000000..1e21d49 --- /dev/null +++ b/dagger/pipeline/tasks/reverse_etl_task.py @@ -0,0 +1,203 @@ +from dagger.pipeline.tasks.batch_task import BatchTask +from dagger.utilities.config_validator import Attribute + +class ReverseEtlTask(BatchTask): + ref_name = "reverse_etl" + + @classmethod + def init_attributes(cls, orig_cls): + cls.add_config_attributes( + [ + Attribute( + attribute_name="executable_prefix", + required=False, + parent_fields=["task_parameters"], + comment="E.g.: python", + ), + Attribute( + attribute_name="executable", + required=False, + parent_fields=["task_parameters"], + comment="E.g.: my_code.py", + ), + Attribute( + attribute_name="assume_role_arn", + parent_fields=["task_parameters"], + required = False, + validator=str, + comment="The ARN of the role to assume before running the job", + ), + Attribute( + attribute_name="num_threads", + parent_fields=["task_parameters"], + required=False, + validator=int, + comment="The number of threads to use for the job", + ), + Attribute( + attribute_name="batch_size", + parent_fields=["task_parameters"], + required=False, + validator=int, + comment="The number of rows to fetch in each batch", + ), + Attribute( + attribute_name="primary_id_column", + 
parent_fields=["task_parameters"], + validator=str, + comment="The primary key column to use for the job", + ), + Attribute( + attribute_name="secondary_id_column", + parent_fields=["task_parameters"], + validator=str, + required=False, + comment="The secondary key column to use for the job", + ), + Attribute( + attribute_name="custom_id_column", + parent_fields=["task_parameters"], + validator=str, + required=False, + comment="The custom key column to use for the job", + ), + Attribute( + attribute_name="model_name", + parent_fields=["task_parameters"], + validator=str, + required=False, + comment="The name of the model. This is going to be a column on the target table. By default it is" + " set to the name of the input.", + ), + Attribute( + attribute_name="project_name", + parent_fields=["task_parameters"], + validator=str, + required=False, + comment="The name of the project. This is going to be a column on the target table. By default it is" + " set to feature_store", + ), + Attribute( + attribute_name="is_deleted_column", + parent_fields=["task_parameters"], + validator=str, + required=False, + comment="The column that has the boolean flag to indicate if the row is deleted", + ), + Attribute( + attribute_name="hash_column", + parent_fields=["task_parameters"], + validator=str, + required=False, + comment="The column that has the hash value of the row to be used to get the diff since " + "the last export. If provided, the from_time is required. It's mutually exclusive with " + "updated_at_column", + ), + Attribute( + attribute_name="updated_at_column", + parent_fields=["task_parameters"], + validator=str, + required=False, + comment="The column that has the last updated timestamp of the row to be used to get the diff " + "since the last export. If provided, the from_time is required. 
It's mutually exclusive " + "with hash_column", + ), + Attribute( + attribute_name="from_time", + parent_fields=["task_parameters"], + validator=str, + required=False, + comment="Timestamp in YYYY-mm-ddTHH:MM format. It is used for incremental loads." + "It's required when hash_column or updated_at_column is provided", + ), + Attribute( + attribute_name="days_to_live", + parent_fields=["task_parameters"], + validator=str, + required=False, + comment="The number of days to keep the data in the table. If provided, the time_to_live attribute " + "will be set in dynamodb", + ), + + ] + ) + + def __init__(self, name, pipeline_name, pipeline, job_config): + super().__init__(name, pipeline_name, pipeline, job_config) + + self.executable = self.executable or "reverse_etl.py" + self.executable_prefix = self.executable_prefix or "python" + + self._assume_role_arn = self.parse_attribute("assume_role_arn") + self._num_threads = self.parse_attribute("num_threads") or 4 + self._batch_size = self.parse_attribute("batch_size") or 10000 + self._absolute_job_name = self._absolute_job_name or "common_batch_jobs/reverse_etl" + self._primary_id_column = self.parse_attribute("primary_id_column") + self._secondary_id_column = self.parse_attribute("secondary_id_column") + self._custom_id_column = self.parse_attribute("custom_id_column") + self._model_name = self.parse_attribute("model_name") + self._project_name = self.parse_attribute("project_name") or "feature_store" + self._is_deleted_column = self.parse_attribute("is_deleted_column") + self._hash_column = self.parse_attribute("hash_column") + self._updated_at_column = self.parse_attribute("updated_at_column") + self._from_time = self.parse_attribute("from_time") + self._days_to_live = self.parse_attribute("days_to_live") + + if self._hash_column and self._updated_at_column: + raise ValueError("hash_column and updated_at_column are mutually exclusive") + + if self._hash_column or self._updated_at_column: + if not self._from_time: + 
raise ValueError("from_time is required when hash_column or updated_at_column is provided") + + @property + def assume_role_arn(self): + return self._assume_role_arn + + @property + def num_threads(self): + return self._num_threads + + @@property + def batch_size(self): + return self._batch_size + + @property + def primary_id_column(self): + return self._primary_id_column + + @property + def secondary_id_column(self): + return self._secondary_id_column + + @property + def custom_id_column(self): + return self._custom_id_column + + @property + def model_name(self): + return self._model_name + + @property + def project_name(self): + return self._project_name + + @property + def is_deleted_column(self): + return self._is_deleted_column + + @property + def hash_column(self): + return self._hash_column + + @property + def updated_at_column(self): + return self._updated_at_column + + @property + def from_time(self): + return self._from_time + + @property + def days_to_live(self): + return self._days_to_live From bfceb811de2bc837ff42c8c080205069ce4f60fb Mon Sep 17 00:00:00 2001 From: David Siklosi Date: Thu, 2 Jan 2025 13:15:45 +0100 Subject: [PATCH 109/134] Registering the new operator with dagger --- dagger/dag_creator/airflow/operator_factory.py | 1 + dagger/pipeline/task_factory.py | 1 + 2 files changed, 2 insertions(+) diff --git a/dagger/dag_creator/airflow/operator_factory.py b/dagger/dag_creator/airflow/operator_factory.py index 706a737..f610f1e 100644 --- a/dagger/dag_creator/airflow/operator_factory.py +++ b/dagger/dag_creator/airflow/operator_factory.py @@ -10,6 +10,7 @@ redshift_load_creator, redshift_transform_creator, redshift_unload_creator, + reverse_etl_creator, spark_creator, sqoop_creator, ) diff --git a/dagger/pipeline/task_factory.py b/dagger/pipeline/task_factory.py index a9c5eef..d8a1e53 100644 --- a/dagger/pipeline/task_factory.py +++ b/dagger/pipeline/task_factory.py @@ -9,6 +9,7 @@ redshift_load_task, redshift_transform_task, 
redshift_unload_task, + reverse_etl_task, spark_task, sqoop_task, ) From 0b2ac50ac6b0d97628e715bba01e6ac8b0fbcead Mon Sep 17 00:00:00 2001 From: David Siklosi Date: Thu, 2 Jan 2025 13:16:19 +0100 Subject: [PATCH 110/134] Small type fix to resolve broken cli help command --- dagger/pipeline/task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dagger/pipeline/task.py b/dagger/pipeline/task.py index ce07aec..d484e49 100644 --- a/dagger/pipeline/task.py +++ b/dagger/pipeline/task.py @@ -39,7 +39,7 @@ def init_attributes(cls, orig_cls): Attribute( attribute_name="task_group", required=False, - format_help=str, + format_help="str", comment="Task group name", ), Attribute( From e414b84cb9781a5e850c9e9d6cbb15416dbac375 Mon Sep 17 00:00:00 2001 From: David Siklosi Date: Thu, 2 Jan 2025 13:17:21 +0100 Subject: [PATCH 111/134] Adding the possibility that inherited operator can overwrite attribute of base operator --- dagger/utilities/config_validator.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/dagger/utilities/config_validator.py b/dagger/utilities/config_validator.py index 1d68f33..1f70c1b 100644 --- a/dagger/utilities/config_validator.py +++ b/dagger/utilities/config_validator.py @@ -98,10 +98,14 @@ def init_attributes_once(cls, orig_cls: str) -> None: cls.init_attributes(orig_cls) if parent_class.__name__ != "ConfigValidator": - cls.config_attributes[cls.__name__] = ( - cls.config_attributes[parent_class.__name__] - + cls.config_attributes[cls.__name__] - ) + parent_attributes = cls.config_attributes[parent_class.__name__] + current_attributes = cls.config_attributes[cls.__name__] + + merged_attributes = {attr.name: attr for attr in parent_attributes} + for attr in current_attributes: + merged_attributes[attr.name] = attr + + cls.config_attributes[cls.__name__] = list(merged_attributes.values()) attributes_lookup = {} for index, attribute in enumerate(cls.config_attributes[cls.__name__]): From 
67a8142cbdd8a143870224f4cfa7616b68114d87 Mon Sep 17 00:00:00 2001 From: David Siklosi Date: Thu, 2 Jan 2025 14:38:33 +0100 Subject: [PATCH 112/134] Smaller fixes; syntax fix; Fixing command creation by extending the existing solution in base class --- .../operator_creators/reverse_etl_creator.py | 22 +++++++++++++++++-- .../airflow/operators/reverse_etl_batch.py | 3 --- dagger/pipeline/tasks/reverse_etl_task.py | 8 +++---- 3 files changed, 23 insertions(+), 10 deletions(-) diff --git a/dagger/dag_creator/airflow/operator_creators/reverse_etl_creator.py b/dagger/dag_creator/airflow/operator_creators/reverse_etl_creator.py index f6f9095..be94f74 100644 --- a/dagger/dag_creator/airflow/operator_creators/reverse_etl_creator.py +++ b/dagger/dag_creator/airflow/operator_creators/reverse_etl_creator.py @@ -1,6 +1,7 @@ import base64 from dagger.dag_creator.airflow.operator_creators.batch_creator import BatchCreator +from dagger.dag_creator.airflow.operators.reverse_etl_batch import ReverseEtlBatchOperator import json @@ -26,8 +27,7 @@ def __init__(self, task, dag): self._days_to_live = task.days_to_live def _generate_command(self): - command = [self._task.executable_prefix, self._task.executable] - + command = BatchCreator._generate_command(self) command.append(f"--num_threads={self._num_threads}") command.append(f"--batch_size={self._batch_size}") @@ -53,3 +53,21 @@ def _generate_command(self): command.append(f"--days_to_live={self._days_to_live}") return command + + def _create_operator(self, **kwargs): + overrides = self._task.overrides + overrides.update({"command": self._generate_command()}) + + job_name = self._validate_job_name(self._task.job_name, self._task.absolute_job_name) + batch_op = ReverseEtlBatchOperator( + dag=self._dag, + task_id=self._task.name, + job_name=self._task.name, + job_definition=job_name, + region_name=self._task.region_name, + job_queue=self._task.job_queue, + container_overrides=overrides, + awslogs_enabled=True, + **kwargs, + ) + return 
batch_op diff --git a/dagger/dag_creator/airflow/operators/reverse_etl_batch.py b/dagger/dag_creator/airflow/operators/reverse_etl_batch.py index abb775b..78c1619 100644 --- a/dagger/dag_creator/airflow/operators/reverse_etl_batch.py +++ b/dagger/dag_creator/airflow/operators/reverse_etl_batch.py @@ -3,6 +3,3 @@ class ReverseEtlBatchOperator(AWSBatchOperator): custom_operator_name = 'ReverseETL' ui_color = "#f0ede4" - - def __init__(self, *args, **kwargs): - super().__init__(args, kwargs) diff --git a/dagger/pipeline/tasks/reverse_etl_task.py b/dagger/pipeline/tasks/reverse_etl_task.py index 1e21d49..47cf555 100644 --- a/dagger/pipeline/tasks/reverse_etl_task.py +++ b/dagger/pipeline/tasks/reverse_etl_task.py @@ -31,14 +31,12 @@ def init_attributes(cls, orig_cls): attribute_name="num_threads", parent_fields=["task_parameters"], required=False, - validator=int, comment="The number of threads to use for the job", ), Attribute( attribute_name="batch_size", parent_fields=["task_parameters"], required=False, - validator=int, comment="The number of rows to fetch in each batch", ), Attribute( @@ -125,8 +123,8 @@ def init_attributes(cls, orig_cls): def __init__(self, name, pipeline_name, pipeline, job_config): super().__init__(name, pipeline_name, pipeline, job_config) - self.executable = self.executable or "reverse_etl.py" - self.executable_prefix = self.executable_prefix or "python" + self._executable = self.executable or "reverse_etl.py" + self._executable_prefix = self.executable_prefix or "python" self._assume_role_arn = self.parse_attribute("assume_role_arn") self._num_threads = self.parse_attribute("num_threads") or 4 @@ -158,7 +156,7 @@ def assume_role_arn(self): def num_threads(self): return self._num_threads - @@property + @property def batch_size(self): return self._batch_size From 0557bab09a16a35acd62df24f20c398b367af318 Mon Sep 17 00:00:00 2001 From: David Siklosi Date: Fri, 3 Jan 2025 12:25:18 +0100 Subject: [PATCH 113/134] Adding dynamo and sns io types --- 
dagger/pipeline/ios/dynamo_io.py | 60 ++++++++++++++++++++++++++++++++ dagger/pipeline/ios/sns_io.py | 60 ++++++++++++++++++++++++++++++++ 2 files changed, 120 insertions(+) create mode 100644 dagger/pipeline/ios/dynamo_io.py create mode 100644 dagger/pipeline/ios/sns_io.py diff --git a/dagger/pipeline/ios/dynamo_io.py b/dagger/pipeline/ios/dynamo_io.py new file mode 100644 index 0000000..c10459c --- /dev/null +++ b/dagger/pipeline/ios/dynamo_io.py @@ -0,0 +1,60 @@ +from dagger.pipeline.io import IO +from dagger.utilities.config_validator import Attribute + + +class DynamoIO(IO): + ref_name = "dynamo" + + @classmethod + def init_attributes(cls, orig_cls): + cls.add_config_attributes( + [ + Attribute( + attribute_name="account_id", + required=False, + comment="Only needed for cross account dynamo tables" + ), + Attribute( + attribute_name="region", + required=False, + comment="Only needed for cross region dynamo tables" + ), + Attribute( + attribute_name="table", + comment="The name of the dynamo table" + ), + ] + ) + + def __init__(self, io_config, config_location): + super().__init__(io_config, config_location) + + self._account_id = self.parse_attribute("account_id") + self._region = self.parse_attribute("region") + self._table = self.parse_attribute("table") + + def alias(self): + return f"dynamo://{self._account_id or ''}/{self._region or ''}/{self._table}" + + @property + def rendered_name(self): + if not self._account_id and not self._region: + return self._table + else: + return ":".join([self._account_id or '', self._region or '', self._table]) + + @property + def airflow_name(self): + return f"dynamo-{'-'.join([name_part for name_part in [self._account_id, self._region, self._table] if name_part])}" + + @property + def account_id(self): + return self._account_id + + @property + def region(self): + return self._region + + @property + def table(self): + return self._table diff --git a/dagger/pipeline/ios/sns_io.py b/dagger/pipeline/ios/sns_io.py new file 
mode 100644 index 0000000..14b4112 --- /dev/null +++ b/dagger/pipeline/ios/sns_io.py @@ -0,0 +1,60 @@ +from dagger.pipeline.io import IO +from dagger.utilities.config_validator import Attribute + + +class SnsIO(IO): + ref_name = "sns" + + @classmethod + def init_attributes(cls, orig_cls): + cls.add_config_attributes( + [ + Attribute( + attribute_name="account_id", + required=False, + comment="Only needed for cross account SNS topics" + ), + Attribute( + attribute_name="region", + required=False, + comment="Only needed for cross region SNS topics" + ), + Attribute( + attribute_name="sns_topic", + comment="The name of the sns topic" + ), + ] + ) + + def __init__(self, io_config, config_location): + super().__init__(io_config, config_location) + + self._account_id = self.parse_attribute("account_id") + self._region = self.parse_attribute("region") + self._sns_topic = self.parse_attribute("sns_topic") + + def alias(self): + return f"sns://{self._account_id or ''}/{self._region or ''}/{self._sns_topic}" + + @property + def rendered_name(self): + if not self._account_id and not self._region: + return self._sns_topic + else: + return ":".join([self._account_id or '', self._region or '', self._sns_topic]) + + @property + def airflow_name(self): + return f"sns-{'-'.join([name_part for name_part in [self._account_id, self._region, self._sns_topic] if name_part])}" + + @property + def account_id(self): + return self._account_id + + @property + def region(self): + return self._region + + @property + def sns_topic(self): + return self._sns_topic \ No newline at end of file From 4ddfc338e346e11e052c99fe6676f9fa8633b53c Mon Sep 17 00:00:00 2001 From: David Siklosi Date: Fri, 3 Jan 2025 12:25:38 +0100 Subject: [PATCH 114/134] Adding dynamo and sns io types --- dagger/pipeline/io_factory.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dagger/pipeline/io_factory.py b/dagger/pipeline/io_factory.py index 5454f31..61d9fd2 100644 --- 
a/dagger/pipeline/io_factory.py +++ b/dagger/pipeline/io_factory.py @@ -7,7 +7,9 @@ gdrive_io, redshift_io, s3_io, - databricks_io + databricks_io, + dynamo_io, + sns_io, ) from dagger.utilities.classes import get_deep_obj_subclasses From 8038d3cd494f2b2f5ced1e741e7c920de3aeface Mon Sep 17 00:00:00 2001 From: David Siklosi Date: Fri, 3 Jan 2025 12:27:17 +0100 Subject: [PATCH 115/134] Fixing input/output name for reverse etl so it matches the batch job expected format; Inferring output type (dynamo/sns) based on output type and passing as an argument to the batch job --- .../operator_creators/reverse_etl_creator.py | 2 ++ dagger/pipeline/io.py | 4 +++ dagger/pipeline/tasks/reverse_etl_task.py | 27 +++++++++++++++++++ 3 files changed, 33 insertions(+) diff --git a/dagger/dag_creator/airflow/operator_creators/reverse_etl_creator.py b/dagger/dag_creator/airflow/operator_creators/reverse_etl_creator.py index be94f74..d81d706 100644 --- a/dagger/dag_creator/airflow/operator_creators/reverse_etl_creator.py +++ b/dagger/dag_creator/airflow/operator_creators/reverse_etl_creator.py @@ -25,6 +25,7 @@ def __init__(self, task, dag): self._updated_at_column = task.updated_at_column self._from_time = task.from_time self._days_to_live = task.days_to_live + self._output_type = task.output_type def _generate_command(self): command = BatchCreator._generate_command(self) @@ -34,6 +35,7 @@ def _generate_command(self): command.append(f"--primary_id_column={self._primary_id_column}") command.append(f"--model_name={self._model_name}") command.append(f"--project_name={self._project_name}") + command.append(f"--output_type={self._output_type}") if self._assume_role_arn: command.append(f"--assume_role_arn={self._assume_role_arn}") diff --git a/dagger/pipeline/io.py b/dagger/pipeline/io.py index 32ae303..cdacdd0 100644 --- a/dagger/pipeline/io.py +++ b/dagger/pipeline/io.py @@ -63,6 +63,10 @@ def alias(self): def name(self): return self._name + @name.setter + def name(self, value): + 
self._name = value + @property def has_dependency(self): return self._has_dependency diff --git a/dagger/pipeline/tasks/reverse_etl_task.py b/dagger/pipeline/tasks/reverse_etl_task.py index 47cf555..29e70dd 100644 --- a/dagger/pipeline/tasks/reverse_etl_task.py +++ b/dagger/pipeline/tasks/reverse_etl_task.py @@ -148,6 +148,29 @@ def __init__(self, name, pipeline_name, pipeline, job_config): if not self._from_time: raise ValueError("from_time is required when hash_column or updated_at_column is provided") + # Making sure the input table name is set as it is expected in the reverse etl job + input_index = self._get_io_index(self._inputs) + self._inputs[input_index].name = "input_table_name" + + # Making sure the output name is set as it is expected in the reverse etl job + output_index = self._get_io_index(self._outputs) + self._outputs[output_index].name = "output_name" + + # Extracting the output type from the output definition + self._output_type = self._outputs[output_index].ref_name + if not self._output_type: + raise ValueError("ReverseEtlTask must have an output") + + @staticmethod + def _get_io_index(ios): + if len([io for io in ios if io.ref_name != "dummy"]) > 1: + raise ValueError("ReverseEtlTask can only have one input or output") + + for i, io in enumerate(ios): + if io.ref_name != "dummy": + return i + + @property def assume_role_arn(self): return self._assume_role_arn @@ -199,3 +222,7 @@ def from_time(self): @property def days_to_live(self): return self._days_to_live + + @property + def output_type(self): + return self._output_type From 60c4736efc6a4d17567a09685c9e3481f8f01dbf Mon Sep 17 00:00:00 2001 From: David Siklosi Date: Fri, 3 Jan 2025 12:35:25 +0100 Subject: [PATCH 116/134] Handling hard wired constants as local parameters of the task --- dagger/pipeline/tasks/reverse_etl_task.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/dagger/pipeline/tasks/reverse_etl_task.py 
b/dagger/pipeline/tasks/reverse_etl_task.py index 29e70dd..3c87a0d 100644 --- a/dagger/pipeline/tasks/reverse_etl_task.py +++ b/dagger/pipeline/tasks/reverse_etl_task.py @@ -4,6 +4,13 @@ class ReverseEtlTask(BatchTask): ref_name = "reverse_etl" + DEFAULT_EXECUTABLE_PREFIX = "python" + DEFAULT_EXECUTABLE = "reverse_etl.py" + DEFAULT_NUM_THREADS = 4 + DEFAULT_BATCH_SIZE = 10000 + DEFAULT_JOB_NAME = "common_batch_jobs/reverse_etl" + DEFAULT_PROJECT_NAME = "feature_store" + @classmethod def init_attributes(cls, orig_cls): cls.add_config_attributes( @@ -123,18 +130,18 @@ def init_attributes(cls, orig_cls): def __init__(self, name, pipeline_name, pipeline, job_config): super().__init__(name, pipeline_name, pipeline, job_config) - self._executable = self.executable or "reverse_etl.py" - self._executable_prefix = self.executable_prefix or "python" + self._executable = self.executable or self.DEFAULT_EXECUTABLE + self._executable_prefix = self.executable_prefix or self.DEFAULT_EXECUTABLE_PREFIX self._assume_role_arn = self.parse_attribute("assume_role_arn") - self._num_threads = self.parse_attribute("num_threads") or 4 - self._batch_size = self.parse_attribute("batch_size") or 10000 - self._absolute_job_name = self._absolute_job_name or "common_batch_jobs/reverse_etl" + self._num_threads = self.parse_attribute("num_threads") or self.DEFAULT_NUM_THREADS + self._batch_size = self.parse_attribute("batch_size") or self.DEFAULT_BATCH_SIZE + self._absolute_job_name = self._absolute_job_name or self.DEFAULT_JOB_NAME self._primary_id_column = self.parse_attribute("primary_id_column") self._secondary_id_column = self.parse_attribute("secondary_id_column") self._custom_id_column = self.parse_attribute("custom_id_column") self._model_name = self.parse_attribute("model_name") - self._project_name = self.parse_attribute("project_name") or "feature_store" + self._project_name = self.parse_attribute("project_name") or self.DEFAULT_PROJECT_NAME self._is_deleted_column = 
self.parse_attribute("is_deleted_column") self._hash_column = self.parse_attribute("hash_column") self._updated_at_column = self.parse_attribute("updated_at_column") From f64910f749f74773083218aeb47d8b8b15def9a6 Mon Sep 17 00:00:00 2001 From: David Siklosi Date: Fri, 3 Jan 2025 16:38:17 +0100 Subject: [PATCH 117/134] Removing account_id from io; naming convention; fixing small issues --- .../operator_creators/reverse_etl_creator.py | 3 +++ dagger/pipeline/ios/dynamo_io.py | 27 +++++-------------- dagger/pipeline/ios/sns_io.py | 22 +++++---------- dagger/pipeline/tasks/reverse_etl_task.py | 25 ++++++++++++----- 4 files changed, 35 insertions(+), 42 deletions(-) diff --git a/dagger/dag_creator/airflow/operator_creators/reverse_etl_creator.py b/dagger/dag_creator/airflow/operator_creators/reverse_etl_creator.py index d81d706..e133e40 100644 --- a/dagger/dag_creator/airflow/operator_creators/reverse_etl_creator.py +++ b/dagger/dag_creator/airflow/operator_creators/reverse_etl_creator.py @@ -26,6 +26,7 @@ def __init__(self, task, dag): self._from_time = task.from_time self._days_to_live = task.days_to_live self._output_type = task.output_type + self._region_name = task.region_name def _generate_command(self): command = BatchCreator._generate_command(self) @@ -53,6 +54,8 @@ def _generate_command(self): command.append(f"--from_time={self._from_time}") if self._days_to_live: command.append(f"--days_to_live={self._days_to_live}") + if self._region_name: + command.append(f"--region_name={self._region_name}") return command diff --git a/dagger/pipeline/ios/dynamo_io.py b/dagger/pipeline/ios/dynamo_io.py index c10459c..88d822e 100644 --- a/dagger/pipeline/ios/dynamo_io.py +++ b/dagger/pipeline/ios/dynamo_io.py @@ -10,12 +10,7 @@ def init_attributes(cls, orig_cls): cls.add_config_attributes( [ Attribute( - attribute_name="account_id", - required=False, - comment="Only needed for cross account dynamo tables" - ), - Attribute( - attribute_name="region", + 
attribute_name="region_name", required=False, comment="Only needed for cross region dynamo tables" ), @@ -29,31 +24,23 @@ def init_attributes(cls, orig_cls): def __init__(self, io_config, config_location): super().__init__(io_config, config_location) - self._account_id = self.parse_attribute("account_id") - self._region = self.parse_attribute("region") + self._region_name = self.parse_attribute("region_name") self._table = self.parse_attribute("table") def alias(self): - return f"dynamo://{self._account_id or ''}/{self._region or ''}/{self._table}" + return f"dynamo://{self._region_name or ''}/{self._table}" @property def rendered_name(self): - if not self._account_id and not self._region: - return self._table - else: - return ":".join([self._account_id or '', self._region or '', self._table]) + return self._table @property def airflow_name(self): - return f"dynamo-{'-'.join([name_part for name_part in [self._account_id, self._region, self._table] if name_part])}" - - @property - def account_id(self): - return self._account_id + return f"dynamo-{'-'.join([name_part for name_part in [self._region_name, self._table] if name_part])}" @property - def region(self): - return self._region + def region_name(self): + return self._region_name @property def table(self): diff --git a/dagger/pipeline/ios/sns_io.py b/dagger/pipeline/ios/sns_io.py index 14b4112..3be660d 100644 --- a/dagger/pipeline/ios/sns_io.py +++ b/dagger/pipeline/ios/sns_io.py @@ -15,7 +15,7 @@ def init_attributes(cls, orig_cls): comment="Only needed for cross account dynamo tables" ), Attribute( - attribute_name="region", + attribute_name="region_name", required=False, comment="Only needed for cross region dynamo tables" ), @@ -29,31 +29,23 @@ def init_attributes(cls, orig_cls): def __init__(self, io_config, config_location): super().__init__(io_config, config_location) - self._account_id = self.parse_attribute("account_id") - self._region = self.parse_attribute("region") + self._region_name = 
self.parse_attribute("region_name") self._sns_topic = self.parse_attribute("sns_topic") def alias(self): - return f"dynamo://{self._account_id or ''}/{self._region or ''}/{self._sns_topic}" + return f"dynamo://{self._region_name or ''}/{self._sns_topic}" @property def rendered_name(self): - if not self._account_id and not self._region: - return self._sns_topic - else: - return ":".join([self._account_id or '', self._region or '', self._sns_topic]) + return self._sns_topic @property def airflow_name(self): - return f"dynamo-{'-'.join([name_part for name_part in [self._account_id, self._region, self._sns_topic] if name_part])}" + return f"dynamo-{'-'.join([name_part for name_part in [self._region_name, self._sns_topic] if name_part])}" @property - def account_id(self): - return self._account_id - - @property - def region(self): - return self._region + def region_name(self): + return self._region_name @property def sns_topic(self): diff --git a/dagger/pipeline/tasks/reverse_etl_task.py b/dagger/pipeline/tasks/reverse_etl_task.py index 3c87a0d..b62bba5 100644 --- a/dagger/pipeline/tasks/reverse_etl_task.py +++ b/dagger/pipeline/tasks/reverse_etl_task.py @@ -149,33 +149,40 @@ def __init__(self, name, pipeline_name, pipeline, job_config): self._days_to_live = self.parse_attribute("days_to_live") if self._hash_column and self._updated_at_column: - raise ValueError("hash_column and updated_at_column are mutually exclusive") + raise ValueError(f"ReverseETLTask: {self._name} hash_column and updated_at_column are mutually exclusive") if self._hash_column or self._updated_at_column: if not self._from_time: - raise ValueError("from_time is required when hash_column or updated_at_column is provided") + raise ValueError(f"ReverseETLTask: {self._name} from_time is required when hash_column or updated_at_column is provided") # Making sure the input table name is set as it is expected in the reverse etl job input_index = self._get_io_index(self._inputs) + print('XXX', self._inputs, 
input_index) + if input_index is None: + raise ValueError(f"ReverseEtlTask: {self._name} must have an input") self._inputs[input_index].name = "input_table_name" # Making sure the output name is set as it is expected in the reverse etl job output_index = self._get_io_index(self._outputs) + if output_index is None: + raise ValueError(f"ReverseEtlTask: {self._name} must have an output") self._outputs[output_index].name = "output_name" # Extracting the output type from the output definition self._output_type = self._outputs[output_index].ref_name - if not self._output_type: - raise ValueError("ReverseEtlTask must have an output") - @staticmethod - def _get_io_index(ios): + # Extracting the outputs region name from the output definition + self._region_name = self._outputs[output_index].region_name + + + def _get_io_index(self, ios): if len([io for io in ios if io.ref_name != "dummy"]) > 1: - raise ValueError("ReverseEtlTask can only have one input or output") + raise ValueError(f"ReverseEtlTask: {self._name} can only have one input or output") for i, io in enumerate(ios): if io.ref_name != "dummy": return i + return None @property @@ -233,3 +240,7 @@ def days_to_live(self): @property def output_type(self): return self._output_type + + @property + def region_name(self): + return self._region_name From 2ea6f7e6ca0d8ce36861fc40545166c5f9ad58cb Mon Sep 17 00:00:00 2001 From: David Siklosi Date: Fri, 3 Jan 2025 17:41:34 +0100 Subject: [PATCH 118/134] Fixing batch job name --- dagger/pipeline/tasks/reverse_etl_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dagger/pipeline/tasks/reverse_etl_task.py b/dagger/pipeline/tasks/reverse_etl_task.py index b62bba5..98094e6 100644 --- a/dagger/pipeline/tasks/reverse_etl_task.py +++ b/dagger/pipeline/tasks/reverse_etl_task.py @@ -8,7 +8,7 @@ class ReverseEtlTask(BatchTask): DEFAULT_EXECUTABLE = "reverse_etl.py" DEFAULT_NUM_THREADS = 4 DEFAULT_BATCH_SIZE = 10000 - DEFAULT_JOB_NAME = 
"common_batch_jobs/reverse_etl" + DEFAULT_JOB_NAME = "common_batch_jobs-reverse_etl" DEFAULT_PROJECT_NAME = "feature_store" @classmethod From 85c9c2aa2b896ccdf8617ea8d2cd59dd58a63480 Mon Sep 17 00:00:00 2001 From: David Siklosi Date: Tue, 7 Jan 2025 13:46:30 +0100 Subject: [PATCH 119/134] Removing choco specific parameters and moving them to conf file; Making some arguments default value handled by the job itslef; Making some arguments required instead hardwiring default value --- dagger/conf.py | 6 ++++++ dagger/dagger_config.yaml | 5 +++++ dagger/pipeline/tasks/reverse_etl_task.py | 25 ++++++++--------------- 3 files changed, 20 insertions(+), 16 deletions(-) diff --git a/dagger/conf.py b/dagger/conf.py index cbb075d..df2ab8e 100644 --- a/dagger/conf.py +++ b/dagger/conf.py @@ -104,3 +104,9 @@ plugin_config = config.get('plugin', None) or {} PLUGIN_DIRS = [os.path.join(AIRFLOW_HOME, path) for path in plugin_config.get('paths', [])] logging.info(f"All Python classes will be loaded as plugins from the following directories: {PLUGIN_DIRS}") + +# ReverseETL parameters +reverse_etl_config = config.get('reverse_etl', None) or {} +REVERSE_ETL_DEFAULT_JOB_NAME = reverse_etl_config.get('default_job_name', None) +REVERSE_ETL_DEFAULT_EXECUTABLE_PREFIX = reverse_etl_config.get('default_executable_prefix', None) +REVERSE_ETL_DEFAULT_EXECUTABLE = reverse_etl_config.get('default_executable', None) diff --git a/dagger/dagger_config.yaml b/dagger/dagger_config.yaml index 69c3d54..38abccd 100644 --- a/dagger/dagger_config.yaml +++ b/dagger/dagger_config.yaml @@ -62,3 +62,8 @@ alert: plugin: # paths: # - plugins + +reverse_etl: +# default_job_name: +# default_executable_prefix: +# default_executable: diff --git a/dagger/pipeline/tasks/reverse_etl_task.py b/dagger/pipeline/tasks/reverse_etl_task.py index 98094e6..6c9a5d2 100644 --- a/dagger/pipeline/tasks/reverse_etl_task.py +++ b/dagger/pipeline/tasks/reverse_etl_task.py @@ -1,16 +1,10 @@ from dagger.pipeline.tasks.batch_task 
import BatchTask from dagger.utilities.config_validator import Attribute +from dagger import conf class ReverseEtlTask(BatchTask): ref_name = "reverse_etl" - DEFAULT_EXECUTABLE_PREFIX = "python" - DEFAULT_EXECUTABLE = "reverse_etl.py" - DEFAULT_NUM_THREADS = 4 - DEFAULT_BATCH_SIZE = 10000 - DEFAULT_JOB_NAME = "common_batch_jobs-reverse_etl" - DEFAULT_PROJECT_NAME = "feature_store" - @classmethod def init_attributes(cls, orig_cls): cls.add_config_attributes( @@ -78,9 +72,8 @@ def init_attributes(cls, orig_cls): attribute_name="project_name", parent_fields=["task_parameters"], validator=str, - required=False, - comment="The name of the project. This is going to be a column on the target table. By default it is" - " set to feature_store", + required=True, + comment="The name of the project. This is going to be a column on the target table.", ), Attribute( attribute_name="is_deleted_column", @@ -130,18 +123,18 @@ def init_attributes(cls, orig_cls): def __init__(self, name, pipeline_name, pipeline, job_config): super().__init__(name, pipeline_name, pipeline, job_config) - self._executable = self.executable or self.DEFAULT_EXECUTABLE - self._executable_prefix = self.executable_prefix or self.DEFAULT_EXECUTABLE_PREFIX + self._executable = self.executable or conf.REVERSE_ETL_DEFAULT_EXECUTABLE + self._executable_prefix = self.executable_prefix or conf.REVERSE_ETL_DEFAULT_EXECUTABLE_PREFIX self._assume_role_arn = self.parse_attribute("assume_role_arn") - self._num_threads = self.parse_attribute("num_threads") or self.DEFAULT_NUM_THREADS - self._batch_size = self.parse_attribute("batch_size") or self.DEFAULT_BATCH_SIZE - self._absolute_job_name = self._absolute_job_name or self.DEFAULT_JOB_NAME + self._num_threads = self.parse_attribute("num_threads") + self._batch_size = self.parse_attribute("batch_size") + self._absolute_job_name = self._absolute_job_name or conf.REVERSE_ETL_DEFAULT_JOB_NAME self._primary_id_column = self.parse_attribute("primary_id_column") 
self._secondary_id_column = self.parse_attribute("secondary_id_column") self._custom_id_column = self.parse_attribute("custom_id_column") self._model_name = self.parse_attribute("model_name") - self._project_name = self.parse_attribute("project_name") or self.DEFAULT_PROJECT_NAME + self._project_name = self.parse_attribute("project_name") self._is_deleted_column = self.parse_attribute("is_deleted_column") self._hash_column = self.parse_attribute("hash_column") self._updated_at_column = self.parse_attribute("updated_at_column") From 3c5a61072cc17388491286aa278f75841936eb52 Mon Sep 17 00:00:00 2001 From: David Siklosi Date: Tue, 7 Jan 2025 15:08:08 +0100 Subject: [PATCH 120/134] Adding unit tests and a small fix --- dagger/pipeline/ios/sns_io.py | 4 ++-- tests/fixtures/pipeline/ios/dynamo_io.yaml | 11 +++++++++++ tests/fixtures/pipeline/ios/sns_io.yaml | 11 +++++++++++ tests/pipeline/ios/test_dynamo_io.py | 19 +++++++++++++++++++ tests/pipeline/ios/test_sns_io.py | 19 +++++++++++++++++++ 5 files changed, 62 insertions(+), 2 deletions(-) create mode 100644 tests/fixtures/pipeline/ios/dynamo_io.yaml create mode 100644 tests/fixtures/pipeline/ios/sns_io.yaml create mode 100644 tests/pipeline/ios/test_dynamo_io.py create mode 100644 tests/pipeline/ios/test_sns_io.py diff --git a/dagger/pipeline/ios/sns_io.py b/dagger/pipeline/ios/sns_io.py index 3be660d..67d17d4 100644 --- a/dagger/pipeline/ios/sns_io.py +++ b/dagger/pipeline/ios/sns_io.py @@ -33,7 +33,7 @@ def __init__(self, io_config, config_location): self._sns_topic = self.parse_attribute("sns_topic") def alias(self): - return f"dynamo://{self._region_name or ''}/{self._sns_topic}" + return f"sns://{self._region_name or ''}/{self._sns_topic}" @property def rendered_name(self): @@ -41,7 +41,7 @@ def rendered_name(self): @property def airflow_name(self): - return f"dynamo-{'-'.join([name_part for name_part in [self._region_name, self._sns_topic] if name_part])}" + return f"sns-{'-'.join([name_part for name_part in 
[self._region_name, self._sns_topic] if name_part])}" @property def region_name(self): diff --git a/tests/fixtures/pipeline/ios/dynamo_io.yaml b/tests/fixtures/pipeline/ios/dynamo_io.yaml new file mode 100644 index 0000000..d083171 --- /dev/null +++ b/tests/fixtures/pipeline/ios/dynamo_io.yaml @@ -0,0 +1,11 @@ +type: dynamo +name: dynamo_table +table: schema.table_name # The name of the dynamo table +region_name: eu_west_1 + + +# Other attributes: + +# has_dependency: # Weather this i/o should be added to the dependency graph or not. Default is True +# follow_external_dependency: # format: dictionary or boolean | External Task Sensor parameters in key value format: https://airflow.apache.org/docs/apache-airflow/stable/_api/airflow/sensors/base/index.html +# region_name: # Only needed for cross region dynamo tables \ No newline at end of file diff --git a/tests/fixtures/pipeline/ios/sns_io.yaml b/tests/fixtures/pipeline/ios/sns_io.yaml new file mode 100644 index 0000000..542fd8e --- /dev/null +++ b/tests/fixtures/pipeline/ios/sns_io.yaml @@ -0,0 +1,11 @@ +type: sns +name: topic_name +sns_topic: topic_name # The name of the dynamo table +region_name: eu_west_1 + + +# Other attributes: + +# has_dependency: # Weather this i/o should be added to the dependency graph or not. 
Default is True +# follow_external_dependency: # format: dictionary or boolean | External Task Sensor parameters in key value format: https://airflow.apache.org/docs/apache-airflow/stable/_api/airflow/sensors/base/index.html +# region_name: # Only needed for cross region dynamo tables \ No newline at end of file diff --git a/tests/pipeline/ios/test_dynamo_io.py b/tests/pipeline/ios/test_dynamo_io.py new file mode 100644 index 0000000..0633c8b --- /dev/null +++ b/tests/pipeline/ios/test_dynamo_io.py @@ -0,0 +1,19 @@ +import unittest +from dagger.pipeline.ios.dynamo_io import DynamoIO +import yaml + + +class DynamoIOTest(unittest.TestCase): + def setUp(self) -> None: + with open("tests/fixtures/pipeline/ios/dynamo_io.yaml", "r") as stream: + config = yaml.safe_load(stream) + + self.dynamo_io = DynamoIO(config, "/") + + def test_properties(self): + self.assertEqual(self.dynamo_io.alias(), "dynamo://eu_west_1/schema.table_name") + self.assertEqual(self.dynamo_io.rendered_name, "schema.table_name") + self.assertEqual(self.dynamo_io.airflow_name,"dynamo-eu_west_1-schema.table_name") + +if __name__ == "__main__": + unittest.main() diff --git a/tests/pipeline/ios/test_sns_io.py b/tests/pipeline/ios/test_sns_io.py new file mode 100644 index 0000000..2a5de25 --- /dev/null +++ b/tests/pipeline/ios/test_sns_io.py @@ -0,0 +1,19 @@ +import unittest +from dagger.pipeline.ios.sns_io import SnsIO +import yaml + + +class SnsIOTest(unittest.TestCase): + def setUp(self) -> None: + with open("tests/fixtures/pipeline/ios/sns_io.yaml", "r") as stream: + config = yaml.safe_load(stream) + + self.sns_io = SnsIO(config, "/") + + def test_properties(self): + self.assertEqual(self.sns_io.alias(), f"sns://eu_west_1/topic_name") + self.assertEqual(self.sns_io.rendered_name, "topic_name") + self.assertEqual(self.sns_io.airflow_name, "sns-eu_west_1-topic_name") + +if __name__ == "__main__": + unittest.main() From f45c84a34fb3608e50f8cecf47f3f87173c958e3 Mon Sep 17 00:00:00 2001 From: David Siklosi 
Date: Tue, 7 Jan 2025 15:10:56 +0100 Subject: [PATCH 121/134] Adding comments --- dagger/utilities/config_validator.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dagger/utilities/config_validator.py b/dagger/utilities/config_validator.py index 1f70c1b..e90af68 100644 --- a/dagger/utilities/config_validator.py +++ b/dagger/utilities/config_validator.py @@ -101,6 +101,7 @@ def init_attributes_once(cls, orig_cls: str) -> None: parent_attributes = cls.config_attributes[parent_class.__name__] current_attributes = cls.config_attributes[cls.__name__] + # Overwriting attributes in parent operator if they are also existing in the child operator merged_attributes = {attr.name: attr for attr in parent_attributes} for attr in current_attributes: merged_attributes[attr.name] = attr From e62890b5d729f881afe25ef0087b73910d139525 Mon Sep 17 00:00:00 2001 From: claudiazi Date: Thu, 16 Jan 2025 09:26:36 +0100 Subject: [PATCH 122/134] feat: add application name to the spark job & add kill spark job when timeout --- .../operators/spark_submit_operator.py | 49 ++++++++++++++++++- 1 file changed, 47 insertions(+), 2 deletions(-) diff --git a/dagger/dag_creator/airflow/operators/spark_submit_operator.py b/dagger/dag_creator/airflow/operators/spark_submit_operator.py index d9df768..af8c451 100644 --- a/dagger/dag_creator/airflow/operators/spark_submit_operator.py +++ b/dagger/dag_creator/airflow/operators/spark_submit_operator.py @@ -85,10 +85,50 @@ def get_cluster_id_by_name(self, emr_cluster_name, cluster_states): else: return None + + def get_application_id_by_name(self, emr_master_instance_id, application_name): + command = f"yarn application -list -appStates RUNNING | grep {application_name}" + + response = self.ssm_client.send_command( + InstanceIds=[emr_master_instance_id], + DocumentName="AWS-RunShellScript", + Parameters={"commands": [command]} + ) + + command_id = response['Command']['CommandId'] + time.sleep(10) # Wait for the command to execute + + output = 
self.ssm_client.get_command_invocation( + CommandId=command_id, + InstanceId=emr_master_instance_id + ) + + stdout = output['StandardOutputContent'] + for line in stdout.split('\n'): + if application_name in line: + application_id = line.split()[0] + return application_id + return None + + def kill_spark_job(self, emr_master_instance_id, application_id): + """ + Kill the Spark job using YARN + """ + kill_command = f"yarn application -kill {application_id}" + self.ssm_client.send_command( + InstanceIds=[emr_master_instance_id], + DocumentName="AWS-RunShellScript", + Parameters={"commands": [kill_command]} + ) + raise AirflowException( + f"Spark job exceeded the execution timeout of {self._execution_timeout} seconds and was terminated.") + + def execute(self, context): """ See `execute` method from airflow.operators.bash_operator """ + start_time = time.time() cluster_id = self.get_cluster_id_by_name(self.cluster_name, ["WAITING", "RUNNING"]) emr_master_instance_id = self.emr_client.list_instances(ClusterId=cluster_id, InstanceGroupTypes=["MASTER"], InstanceStates=["RUNNING"])["Instances"][0][ @@ -101,20 +141,25 @@ def execute(self, context): response = self.ssm_client.send_command( InstanceIds=[emr_master_instance_id], DocumentName="AWS-RunShellScript", - Parameters= command_parameters + Parameters=command_parameters ) command_id = response['Command']['CommandId'] status = 'Pending' status_details = None while status in ['Pending', 'InProgress', 'Delayed']: time.sleep(30) + elapsed_time = time.time() - start_time + if self._execution_timeout and elapsed_time > self._execution_timeout: + application_id = self.get_application_id_by_name(emr_master_instance_id, + self.spark_conf_args["application_name"]) + if application_id: + self.kill_spark_job(emr_master_instance_id, application_id) response = self.ssm_client.get_command_invocation(CommandId=command_id, InstanceId=emr_master_instance_id) status = response['Status'] status_details = response['StatusDetails'] 
self.log.info( self.ssm_client.get_command_invocation(CommandId=command_id, InstanceId=emr_master_instance_id)[ 'StandardErrorContent']) - if status != 'Success': raise AirflowException(f"Spark command failed, check Spark job status in YARN resource manager. " f"Response status details: {status_details}") From 31085acd5aea5a2f18f3094fdc3b72180a6a0907 Mon Sep 17 00:00:00 2001 From: claudiazi Date: Thu, 16 Jan 2025 12:09:24 +0100 Subject: [PATCH 123/134] fix: type of _execution_timeout --- .../dag_creator/airflow/operators/spark_submit_operator.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dagger/dag_creator/airflow/operators/spark_submit_operator.py b/dagger/dag_creator/airflow/operators/spark_submit_operator.py index af8c451..f3ef0a4 100644 --- a/dagger/dag_creator/airflow/operators/spark_submit_operator.py +++ b/dagger/dag_creator/airflow/operators/spark_submit_operator.py @@ -71,7 +71,6 @@ def get_execution_timeout(self): return None def get_cluster_id_by_name(self, emr_cluster_name, cluster_states): - response = self.emr_client.list_clusters(ClusterStates=cluster_states) matching_clusters = list( filter(lambda cluster: cluster['Name'] == emr_cluster_name, response['Clusters'])) @@ -87,6 +86,9 @@ def get_cluster_id_by_name(self, emr_cluster_name, cluster_states): def get_application_id_by_name(self, emr_master_instance_id, application_name): + """ + Get the application ID of the Spark job + """ command = f"yarn application -list -appStates RUNNING | grep {application_name}" response = self.ssm_client.send_command( @@ -149,7 +151,7 @@ def execute(self, context): while status in ['Pending', 'InProgress', 'Delayed']: time.sleep(30) elapsed_time = time.time() - start_time - if self._execution_timeout and elapsed_time > self._execution_timeout: + if self._execution_timeout and elapsed_time > self._execution_timeout.total_seconds(): application_id = self.get_application_id_by_name(emr_master_instance_id, 
self.spark_conf_args["application_name"]) if application_id: From 02283d824a110117ecf246c8f6dab72cbdf1301e Mon Sep 17 00:00:00 2001 From: claudiazi Date: Thu, 16 Jan 2025 12:33:59 +0100 Subject: [PATCH 124/134] fix: remove the wrong and uncessary function --- dagger/dag_creator/airflow/operators/spark_submit_operator.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/dagger/dag_creator/airflow/operators/spark_submit_operator.py b/dagger/dag_creator/airflow/operators/spark_submit_operator.py index f3ef0a4..61ab3f4 100644 --- a/dagger/dag_creator/airflow/operators/spark_submit_operator.py +++ b/dagger/dag_creator/airflow/operators/spark_submit_operator.py @@ -165,7 +165,3 @@ def execute(self, context): if status != 'Success': raise AirflowException(f"Spark command failed, check Spark job status in YARN resource manager. " f"Response status details: {status_details}") - - def on_kill(self): - self.log.info("Sending SIGTERM signal to bash process group") - os.killpg(os.getpgid(self.sp.pid), signal.SIGTERM) From c9a1c6f223c54193add2a13777e217adac4f30c4 Mon Sep 17 00:00:00 2001 From: claudiazi Date: Fri, 17 Jan 2025 02:38:43 +0100 Subject: [PATCH 125/134] fix: timeout logic --- .../operators/spark_submit_operator.py | 87 +++++++++++-------- 1 file changed, 52 insertions(+), 35 deletions(-) diff --git a/dagger/dag_creator/airflow/operators/spark_submit_operator.py b/dagger/dag_creator/airflow/operators/spark_submit_operator.py index 61ab3f4..d3156ed 100644 --- a/dagger/dag_creator/airflow/operators/spark_submit_operator.py +++ b/dagger/dag_creator/airflow/operators/spark_submit_operator.py @@ -125,43 +125,60 @@ def kill_spark_job(self, emr_master_instance_id, application_id): raise AirflowException( f"Spark job exceeded the execution timeout of {self._execution_timeout} seconds and was terminated.") - def execute(self, context): """ See `execute` method from airflow.operators.bash_operator """ start_time = time.time() - cluster_id = 
self.get_cluster_id_by_name(self.cluster_name, ["WAITING", "RUNNING"]) - emr_master_instance_id = self.emr_client.list_instances(ClusterId=cluster_id, InstanceGroupTypes=["MASTER"], - InstanceStates=["RUNNING"])["Instances"][0][ - "Ec2InstanceId"] - - command_parameters = {"commands": [self.spark_submit_cmd]} - if self._execution_timeout: - command_parameters["executionTimeout"] = [self.get_execution_timeout()] - - response = self.ssm_client.send_command( - InstanceIds=[emr_master_instance_id], - DocumentName="AWS-RunShellScript", - Parameters=command_parameters - ) - command_id = response['Command']['CommandId'] - status = 'Pending' - status_details = None - while status in ['Pending', 'InProgress', 'Delayed']: - time.sleep(30) - elapsed_time = time.time() - start_time - if self._execution_timeout and elapsed_time > self._execution_timeout.total_seconds(): - application_id = self.get_application_id_by_name(emr_master_instance_id, - self.spark_conf_args["application_name"]) - if application_id: - self.kill_spark_job(emr_master_instance_id, application_id) - response = self.ssm_client.get_command_invocation(CommandId=command_id, InstanceId=emr_master_instance_id) - status = response['Status'] - status_details = response['StatusDetails'] - self.log.info( - self.ssm_client.get_command_invocation(CommandId=command_id, InstanceId=emr_master_instance_id)[ - 'StandardErrorContent']) - if status != 'Success': - raise AirflowException(f"Spark command failed, check Spark job status in YARN resource manager. 
" - f"Response status details: {status_details}") + try: + # Get cluster and master node information + cluster_id = self.get_cluster_id_by_name(self.cluster_name, ["WAITING", "RUNNING"]) + emr_master_instance_id = self.emr_client.list_instances( + ClusterId=cluster_id, InstanceGroupTypes=["MASTER"], InstanceStates=["RUNNING"] + )["Instances"][0]["Ec2InstanceId"] + + # Build the command parameters + command_parameters = {"commands": [self.spark_submit_cmd]} + if self._execution_timeout: + command_parameters["executionTimeout"] = [self.get_execution_timeout()] + + # Send the command via SSM + response = self.ssm_client.send_command( + InstanceIds=[emr_master_instance_id], + DocumentName="AWS-RunShellScript", + Parameters=command_parameters + ) + command_id = response['Command']['CommandId'] + status = 'Pending' + status_details = None + + # Monitor the command's execution + while status in ['Pending', 'InProgress', 'Delayed']: + time.sleep(30) + # Check the status of the SSM command + response = self.ssm_client.get_command_invocation( + CommandId=command_id, InstanceId=emr_master_instance_id + ) + status = response['Status'] + status_details = response['StatusDetails'] + + self.log.info( + self.ssm_client.get_command_invocation( + CommandId=command_id, InstanceId=emr_master_instance_id + )['StandardErrorContent'] + ) + + # Raise an exception if the command did not succeed + if status != 'Success': + raise AirflowException(f"Spark command failed, check Spark job status in YARN resource manager. " + f"Response status details: {status_details}") + + except AirflowTaskTimeout: + # Handle task timeout + self.log.error("Task timed out. 
Attempting to terminate the Spark job.") + application_id = self.get_application_id_by_name( + emr_master_instance_id, self.spark_conf_args["application_name"] + ) + if application_id: + self.kill_spark_job(emr_master_instance_id, application_id) + raise AirflowException("Task timed out and the Spark job was terminated.") From bfb935c61485555618b3e30d5173c502f39a2655 Mon Sep 17 00:00:00 2001 From: claudiazi Date: Fri, 17 Jan 2025 03:07:15 +0100 Subject: [PATCH 126/134] fix: add missing import --- dagger/dag_creator/airflow/operators/spark_submit_operator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dagger/dag_creator/airflow/operators/spark_submit_operator.py b/dagger/dag_creator/airflow/operators/spark_submit_operator.py index d3156ed..a87e225 100644 --- a/dagger/dag_creator/airflow/operators/spark_submit_operator.py +++ b/dagger/dag_creator/airflow/operators/spark_submit_operator.py @@ -4,7 +4,7 @@ import time import boto3 -from airflow.exceptions import AirflowException +from airflow.exceptions import AirflowException, AirflowTaskTimeout from airflow.utils.decorators import apply_defaults from dagger.dag_creator.airflow.operators.dagger_base_operator import DaggerBaseOperator From 7fba8d564bed48543e54c4f5ef4cdc5c61e63940 Mon Sep 17 00:00:00 2001 From: claudiazi Date: Fri, 17 Jan 2025 04:45:44 +0100 Subject: [PATCH 127/134] fix: spark_app_name --- dagger/dag_creator/airflow/operator_creators/spark_creator.py | 1 + dagger/dag_creator/airflow/operators/spark_submit_operator.py | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/dagger/dag_creator/airflow/operator_creators/spark_creator.py b/dagger/dag_creator/airflow/operator_creators/spark_creator.py index c48ebda..2d1aae8 100644 --- a/dagger/dag_creator/airflow/operator_creators/spark_creator.py +++ b/dagger/dag_creator/airflow/operator_creators/spark_creator.py @@ -91,6 +91,7 @@ def _create_operator(self, **kwargs): job_args=_parse_args(self._template_parameters), 
spark_args=_parse_spark_args(self._task.spark_args), spark_conf_args=_parse_spark_args(self._task.spark_conf_args, '=', 'conf '), + spark_app_name=self._task.spark_conf_args.get("spark.app.name", ""), extra_py_files=self._task.extra_py_files, **kwargs, ) diff --git a/dagger/dag_creator/airflow/operators/spark_submit_operator.py b/dagger/dag_creator/airflow/operators/spark_submit_operator.py index a87e225..6501fcb 100644 --- a/dagger/dag_creator/airflow/operators/spark_submit_operator.py +++ b/dagger/dag_creator/airflow/operators/spark_submit_operator.py @@ -25,6 +25,7 @@ def __init__( job_args=None, spark_args=None, spark_conf_args=None, + spark_app_name=None, extra_py_files=None, *args, **kwargs, @@ -34,6 +35,7 @@ def __init__( self.job_args = job_args self.spark_args = spark_args self.spark_conf_args = spark_conf_args + self.spark_app_name = spark_app_name self.extra_py_files = extra_py_files self.cluster_name = cluster_name self._execution_timeout = kwargs.get('execution_timeout') @@ -177,7 +179,7 @@ def execute(self, context): # Handle task timeout self.log.error("Task timed out. 
Attempting to terminate the Spark job.") application_id = self.get_application_id_by_name( - emr_master_instance_id, self.spark_conf_args["application_name"] + emr_master_instance_id, self.spark_app_name ) if application_id: self.kill_spark_job(emr_master_instance_id, application_id) From 37a31f86d205d907031e37968c80d708454462fc Mon Sep 17 00:00:00 2001 From: claudiazi Date: Fri, 17 Jan 2025 10:25:18 +0100 Subject: [PATCH 128/134] fix: default spark_app_name --- .../operator_creators/spark_creator.py | 2 +- .../operators/spark_submit_operator.py | 35 ++++++++++--------- 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/dagger/dag_creator/airflow/operator_creators/spark_creator.py b/dagger/dag_creator/airflow/operator_creators/spark_creator.py index 2d1aae8..8212a08 100644 --- a/dagger/dag_creator/airflow/operator_creators/spark_creator.py +++ b/dagger/dag_creator/airflow/operator_creators/spark_creator.py @@ -91,7 +91,7 @@ def _create_operator(self, **kwargs): job_args=_parse_args(self._template_parameters), spark_args=_parse_spark_args(self._task.spark_args), spark_conf_args=_parse_spark_args(self._task.spark_conf_args, '=', 'conf '), - spark_app_name=self._task.spark_conf_args.get("spark.app.name", ""), + spark_app_name=self._task.spark_conf_args.get("spark.app.name", None) if self._task.spark_conf_args else None, extra_py_files=self._task.extra_py_files, **kwargs, ) diff --git a/dagger/dag_creator/airflow/operators/spark_submit_operator.py b/dagger/dag_creator/airflow/operators/spark_submit_operator.py index 6501fcb..1bb4b19 100644 --- a/dagger/dag_creator/airflow/operators/spark_submit_operator.py +++ b/dagger/dag_creator/airflow/operators/spark_submit_operator.py @@ -91,27 +91,28 @@ def get_application_id_by_name(self, emr_master_instance_id, application_name): """ Get the application ID of the Spark job """ - command = f"yarn application -list -appStates RUNNING | grep {application_name}" + if application_name: + command = f"yarn application 
-list -appStates RUNNING | grep {application_name}" - response = self.ssm_client.send_command( - InstanceIds=[emr_master_instance_id], - DocumentName="AWS-RunShellScript", - Parameters={"commands": [command]} - ) + response = self.ssm_client.send_command( + InstanceIds=[emr_master_instance_id], + DocumentName="AWS-RunShellScript", + Parameters={"commands": [command]} + ) - command_id = response['Command']['CommandId'] - time.sleep(10) # Wait for the command to execute + command_id = response['Command']['CommandId'] + time.sleep(10) # Wait for the command to execute - output = self.ssm_client.get_command_invocation( - CommandId=command_id, - InstanceId=emr_master_instance_id - ) + output = self.ssm_client.get_command_invocation( + CommandId=command_id, + InstanceId=emr_master_instance_id + ) - stdout = output['StandardOutputContent'] - for line in stdout.split('\n'): - if application_name in line: - application_id = line.split()[0] - return application_id + stdout = output['StandardOutputContent'] + for line in stdout.split('\n'): + if application_name in line: + application_id = line.split()[0] + return application_id return None def kill_spark_job(self, emr_master_instance_id, application_id): From a3422db71b3b30df61ebf7d5d1fda36d61651ddd Mon Sep 17 00:00:00 2001 From: claudiazi Date: Mon, 20 Jan 2025 08:13:30 +0100 Subject: [PATCH 129/134] feat: improve the logic to kill the spark job --- .../operators/spark_submit_operator.py | 61 ++++++++++--------- 1 file changed, 33 insertions(+), 28 deletions(-) diff --git a/dagger/dag_creator/airflow/operators/spark_submit_operator.py b/dagger/dag_creator/airflow/operators/spark_submit_operator.py index 1bb4b19..a5df9cc 100644 --- a/dagger/dag_creator/airflow/operators/spark_submit_operator.py +++ b/dagger/dag_creator/airflow/operators/spark_submit_operator.py @@ -1,10 +1,9 @@ import logging import os -import signal import time import boto3 -from airflow.exceptions import AirflowException, AirflowTaskTimeout +from 
airflow.exceptions import AirflowException from airflow.utils.decorators import apply_defaults from dagger.dag_creator.airflow.operators.dagger_base_operator import DaggerBaseOperator @@ -39,6 +38,8 @@ def __init__( self.extra_py_files = extra_py_files self.cluster_name = cluster_name self._execution_timeout = kwargs.get('execution_timeout') + self._application_id = None + self._emr_master_instance_id = None @property def emr_client(self): @@ -115,18 +116,26 @@ def get_application_id_by_name(self, emr_master_instance_id, application_name): return application_id return None - def kill_spark_job(self, emr_master_instance_id, application_id): - """ - Kill the Spark job using YARN - """ - kill_command = f"yarn application -kill {application_id}" - self.ssm_client.send_command( - InstanceIds=[emr_master_instance_id], - DocumentName="AWS-RunShellScript", - Parameters={"commands": [kill_command]} - ) - raise AirflowException( - f"Spark job exceeded the execution timeout of {self._execution_timeout} seconds and was terminated.") + + def kill_spark_job(self): + if self._application_id and self._emr_master_instance_id: + kill_command = f"yarn application -kill {self._application_id}" + self.ssm_client.send_command( + InstanceIds=[self._emr_master_instance_id], + DocumentName="AWS-RunShellScript", + Parameters={"commands": [kill_command]}, + ) + logging.info( + f"Spark job {self._application_id} terminated successfully." + ) + else: + logging.warning("No application ID or master instance ID found to terminate.") + + + def on_kill(self): + logging.info("Task killed. 
Attempting to terminate the Spark job.") + self.kill_spark_job() + def execute(self, context): """ @@ -136,7 +145,7 @@ def execute(self, context): try: # Get cluster and master node information cluster_id = self.get_cluster_id_by_name(self.cluster_name, ["WAITING", "RUNNING"]) - emr_master_instance_id = self.emr_client.list_instances( + self._emr_master_instance_id = self.emr_client.list_instances( ClusterId=cluster_id, InstanceGroupTypes=["MASTER"], InstanceStates=["RUNNING"] )["Instances"][0]["Ec2InstanceId"] @@ -147,7 +156,7 @@ def execute(self, context): # Send the command via SSM response = self.ssm_client.send_command( - InstanceIds=[emr_master_instance_id], + InstanceIds=[self._emr_master_instance_id], DocumentName="AWS-RunShellScript", Parameters=command_parameters ) @@ -160,28 +169,24 @@ def execute(self, context): time.sleep(30) # Check the status of the SSM command response = self.ssm_client.get_command_invocation( - CommandId=command_id, InstanceId=emr_master_instance_id + CommandId=command_id, InstanceId=self._emr_master_instance_id ) status = response['Status'] status_details = response['StatusDetails'] self.log.info( self.ssm_client.get_command_invocation( - CommandId=command_id, InstanceId=emr_master_instance_id + CommandId=command_id, InstanceId=self._emr_master_instance_id )['StandardErrorContent'] ) - # Raise an exception if the command did not succeed + # Kill the command and raise an exception if the command did not succeed if status != 'Success': + self.kill_spark_job() raise AirflowException(f"Spark command failed, check Spark job status in YARN resource manager. " f"Response status details: {status_details}") - except AirflowTaskTimeout: - # Handle task timeout - self.log.error("Task timed out. 
Attempting to terminate the Spark job.") - application_id = self.get_application_id_by_name( - emr_master_instance_id, self.spark_app_name - ) - if application_id: - self.kill_spark_job(emr_master_instance_id, application_id) - raise AirflowException("Task timed out and the Spark job was terminated.") + except Exception as e: + logging.error(f"Error encountered: {str(e)}") + self.kill_spark_job() + raise AirflowException(f"Task failed with error: {str(e)}") From 6046a01243d82010e50cbf9d657131d7dec12082 Mon Sep 17 00:00:00 2001 From: claudiazi Date: Mon, 20 Jan 2025 08:56:54 +0100 Subject: [PATCH 130/134] chore: black + add log + missing function --- .../operators/spark_submit_operator.py | 102 ++++++++++-------- 1 file changed, 59 insertions(+), 43 deletions(-) diff --git a/dagger/dag_creator/airflow/operators/spark_submit_operator.py b/dagger/dag_creator/airflow/operators/spark_submit_operator.py index a5df9cc..d94b9b5 100644 --- a/dagger/dag_creator/airflow/operators/spark_submit_operator.py +++ b/dagger/dag_creator/airflow/operators/spark_submit_operator.py @@ -18,16 +18,16 @@ class SparkSubmitOperator(DaggerBaseOperator): @apply_defaults def __init__( - self, - job_file, - cluster_name, - job_args=None, - spark_args=None, - spark_conf_args=None, - spark_app_name=None, - extra_py_files=None, - *args, - **kwargs, + self, + job_file, + cluster_name, + job_args=None, + spark_args=None, + spark_conf_args=None, + spark_app_name=None, + extra_py_files=None, + *args, + **kwargs, ): super().__init__(*args, **kwargs) self.job_file = job_file @@ -37,7 +37,7 @@ def __init__( self.spark_app_name = spark_app_name self.extra_py_files = extra_py_files self.cluster_name = cluster_name - self._execution_timeout = kwargs.get('execution_timeout') + self._execution_timeout = kwargs.get("execution_timeout") self._application_id = None self._emr_master_instance_id = None @@ -76,47 +76,54 @@ def get_execution_timeout(self): def get_cluster_id_by_name(self, emr_cluster_name, 
cluster_states): response = self.emr_client.list_clusters(ClusterStates=cluster_states) matching_clusters = list( - filter(lambda cluster: cluster['Name'] == emr_cluster_name, response['Clusters'])) + filter( + lambda cluster: cluster["Name"] == emr_cluster_name, + response["Clusters"], + ) + ) if len(matching_clusters) == 1: - cluster_id = matching_clusters[0]['Id'] - logging.info('Found cluster name = %s id = %s' % (emr_cluster_name, cluster_id)) + cluster_id = matching_clusters[0]["Id"] + logging.info( + "Found cluster name = %s id = %s" % (emr_cluster_name, cluster_id) + ) return cluster_id elif len(matching_clusters) > 1: - raise AirflowException('More than one cluster found for name = %s' % emr_cluster_name) + raise AirflowException( + "More than one cluster found for name = %s" % emr_cluster_name + ) else: return None - def get_application_id_by_name(self, emr_master_instance_id, application_name): """ Get the application ID of the Spark job """ if application_name: - command = f"yarn application -list -appStates RUNNING | grep {application_name}" + command = ( + f"yarn application -list -appStates RUNNING | grep {application_name}" + ) response = self.ssm_client.send_command( InstanceIds=[emr_master_instance_id], DocumentName="AWS-RunShellScript", - Parameters={"commands": [command]} + Parameters={"commands": [command]}, ) - command_id = response['Command']['CommandId'] + command_id = response["Command"]["CommandId"] time.sleep(10) # Wait for the command to execute output = self.ssm_client.get_command_invocation( - CommandId=command_id, - InstanceId=emr_master_instance_id + CommandId=command_id, InstanceId=emr_master_instance_id ) - stdout = output['StandardOutputContent'] - for line in stdout.split('\n'): + stdout = output["StandardOutputContent"] + for line in stdout.split("\n"): if application_name in line: application_id = line.split()[0] return application_id return None - def kill_spark_job(self): if self._application_id and 
self._emr_master_instance_id: kill_command = f"yarn application -kill {self._application_id}" @@ -125,28 +132,29 @@ def kill_spark_job(self): DocumentName="AWS-RunShellScript", Parameters={"commands": [kill_command]}, ) - logging.info( - f"Spark job {self._application_id} terminated successfully." - ) + logging.info(f"Spark job {self._application_id} terminated successfully.") else: - logging.warning("No application ID or master instance ID found to terminate.") - + logging.warning( + "No application ID or master instance ID found to terminate." + ) def on_kill(self): logging.info("Task killed. Attempting to terminate the Spark job.") self.kill_spark_job() - def execute(self, context): """ See `execute` method from airflow.operators.bash_operator """ - start_time = time.time() try: # Get cluster and master node information - cluster_id = self.get_cluster_id_by_name(self.cluster_name, ["WAITING", "RUNNING"]) + cluster_id = self.get_cluster_id_by_name( + self.cluster_name, ["WAITING", "RUNNING"] + ) self._emr_master_instance_id = self.emr_client.list_instances( - ClusterId=cluster_id, InstanceGroupTypes=["MASTER"], InstanceStates=["RUNNING"] + ClusterId=cluster_id, + InstanceGroupTypes=["MASTER"], + InstanceStates=["RUNNING"], )["Instances"][0]["Ec2InstanceId"] # Build the command parameters @@ -158,33 +166,41 @@ def execute(self, context): response = self.ssm_client.send_command( InstanceIds=[self._emr_master_instance_id], DocumentName="AWS-RunShellScript", - Parameters=command_parameters + Parameters=command_parameters, ) - command_id = response['Command']['CommandId'] - status = 'Pending' + command_id = response["Command"]["CommandId"] + status = "Pending" status_details = None + self._application_id = self.get_application_id_by_name( + self._emr_master_instance_id, self.spark_app_name + ) + self.log.info( + f"emr:{self._emr_master_instance_id}, application_name:{self.spark_app_name}, application_id: {self._application_id}" + ) # Monitor the command's execution - 
while status in ['Pending', 'InProgress', 'Delayed']: + while status in ["Pending", "InProgress", "Delayed"]: time.sleep(30) # Check the status of the SSM command response = self.ssm_client.get_command_invocation( CommandId=command_id, InstanceId=self._emr_master_instance_id ) - status = response['Status'] - status_details = response['StatusDetails'] + status = response["Status"] + status_details = response["StatusDetails"] self.log.info( self.ssm_client.get_command_invocation( CommandId=command_id, InstanceId=self._emr_master_instance_id - )['StandardErrorContent'] + )["StandardErrorContent"] ) # Kill the command and raise an exception if the command did not succeed - if status != 'Success': + if status != "Success": self.kill_spark_job() - raise AirflowException(f"Spark command failed, check Spark job status in YARN resource manager. " - f"Response status details: {status_details}") + raise AirflowException( + f"Spark command failed, check Spark job status in YARN resource manager. " + f"Response status details: {status_details}" + ) except Exception as e: logging.error(f"Error encountered: {str(e)}") From 57c394b6db3a47d3cec71a84fb7f6b18f7db1c9f Mon Sep 17 00:00:00 2001 From: claudiazi Date: Mon, 20 Jan 2025 09:46:31 +0100 Subject: [PATCH 131/134] chore: add info to debug --- .../operators/spark_submit_operator.py | 39 +++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/dagger/dag_creator/airflow/operators/spark_submit_operator.py b/dagger/dag_creator/airflow/operators/spark_submit_operator.py index d94b9b5..fd26f12 100644 --- a/dagger/dag_creator/airflow/operators/spark_submit_operator.py +++ b/dagger/dag_creator/airflow/operators/spark_submit_operator.py @@ -117,13 +117,46 @@ def get_application_id_by_name(self, emr_master_instance_id, application_name): CommandId=command_id, InstanceId=emr_master_instance_id ) + self.log.info(f"ouotput: {output}") + stdout = output["StandardOutputContent"] + self.log.info(f"stdout: {stdout}") for line in 
stdout.split("\n"): if application_name in line: application_id = line.split()[0] return application_id return None + def get_application_id_by_name(self, emr_master_instance_id, application_name): + """ + Get the application ID of the Spark job + """ + if application_name: + command = f"yarn application -list -appStates RUNNING | grep {application_name}" + + response = self.ssm_client.send_command( + InstanceIds=[emr_master_instance_id], + DocumentName="AWS-RunShellScript", + Parameters={"commands": [command]} + ) + + command_id = response['Command']['CommandId'] + time.sleep(10) # Wait for the command to execute + + output = self.ssm_client.get_command_invocation( + CommandId=command_id, + InstanceId=emr_master_instance_id + ) + + stdout = output['StandardOutputContent'] + for line in stdout.split('\n'): + if application_name in line: + application_id = line.split()[0] + return application_id + return None + + + def kill_spark_job(self): if self._application_id and self._emr_master_instance_id: kill_command = f"yarn application -kill {self._application_id}" @@ -203,6 +236,12 @@ def execute(self, context): ) except Exception as e: + self._application_id = self.get_application_id_by_name( + self._emr_master_instance_id, self.spark_app_name + ) + self.log.info( + f"emr:{self._emr_master_instance_id}, application_name:{self.spark_app_name}, application_id: {self._application_id}" + ) logging.error(f"Error encountered: {str(e)}") self.kill_spark_job() raise AirflowException(f"Task failed with error: {str(e)}") From 6fdb6740b5c338c4d48dbf10abc2bfcdad925432 Mon Sep 17 00:00:00 2001 From: claudiazi Date: Mon, 20 Jan 2025 10:05:32 +0100 Subject: [PATCH 132/134] chore: add info to debug --- .../dag_creator/airflow/operators/spark_submit_operator.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/dagger/dag_creator/airflow/operators/spark_submit_operator.py b/dagger/dag_creator/airflow/operators/spark_submit_operator.py index fd26f12..aff9bfb 100644 
--- a/dagger/dag_creator/airflow/operators/spark_submit_operator.py +++ b/dagger/dag_creator/airflow/operators/spark_submit_operator.py @@ -156,8 +156,13 @@ def get_application_id_by_name(self, emr_master_instance_id, application_name): return None - def kill_spark_job(self): + self._application_id = self.get_application_id_by_name( + self._emr_master_instance_id, self.spark_app_name + ) + self.log.info( + f"emr:{self._emr_master_instance_id}, application_name:{self.spark_app_name}, application_id: {self._application_id}" + ) if self._application_id and self._emr_master_instance_id: kill_command = f"yarn application -kill {self._application_id}" self.ssm_client.send_command( From a5d627cc4e4ab8e0cab2356ebc81ff93859646fd Mon Sep 17 00:00:00 2001 From: claudiazi Date: Mon, 20 Jan 2025 10:53:11 +0100 Subject: [PATCH 133/134] chore: reformat --- .../operators/spark_submit_operator.py | 47 ------------------- 1 file changed, 47 deletions(-) diff --git a/dagger/dag_creator/airflow/operators/spark_submit_operator.py b/dagger/dag_creator/airflow/operators/spark_submit_operator.py index aff9bfb..31f6a70 100644 --- a/dagger/dag_creator/airflow/operators/spark_submit_operator.py +++ b/dagger/dag_creator/airflow/operators/spark_submit_operator.py @@ -117,52 +117,17 @@ def get_application_id_by_name(self, emr_master_instance_id, application_name): CommandId=command_id, InstanceId=emr_master_instance_id ) - self.log.info(f"ouotput: {output}") - stdout = output["StandardOutputContent"] - self.log.info(f"stdout: {stdout}") for line in stdout.split("\n"): if application_name in line: application_id = line.split()[0] return application_id return None - def get_application_id_by_name(self, emr_master_instance_id, application_name): - """ - Get the application ID of the Spark job - """ - if application_name: - command = f"yarn application -list -appStates RUNNING | grep {application_name}" - - response = self.ssm_client.send_command( - InstanceIds=[emr_master_instance_id], - 
DocumentName="AWS-RunShellScript", - Parameters={"commands": [command]} - ) - - command_id = response['Command']['CommandId'] - time.sleep(10) # Wait for the command to execute - - output = self.ssm_client.get_command_invocation( - CommandId=command_id, - InstanceId=emr_master_instance_id - ) - - stdout = output['StandardOutputContent'] - for line in stdout.split('\n'): - if application_name in line: - application_id = line.split()[0] - return application_id - return None - - def kill_spark_job(self): self._application_id = self.get_application_id_by_name( self._emr_master_instance_id, self.spark_app_name ) - self.log.info( - f"emr:{self._emr_master_instance_id}, application_name:{self.spark_app_name}, application_id: {self._application_id}" - ) if self._application_id and self._emr_master_instance_id: kill_command = f"yarn application -kill {self._application_id}" self.ssm_client.send_command( @@ -209,12 +174,6 @@ def execute(self, context): command_id = response["Command"]["CommandId"] status = "Pending" status_details = None - self._application_id = self.get_application_id_by_name( - self._emr_master_instance_id, self.spark_app_name - ) - self.log.info( - f"emr:{self._emr_master_instance_id}, application_name:{self.spark_app_name}, application_id: {self._application_id}" - ) # Monitor the command's execution while status in ["Pending", "InProgress", "Delayed"]: @@ -241,12 +200,6 @@ def execute(self, context): ) except Exception as e: - self._application_id = self.get_application_id_by_name( - self._emr_master_instance_id, self.spark_app_name - ) - self.log.info( - f"emr:{self._emr_master_instance_id}, application_name:{self.spark_app_name}, application_id: {self._application_id}" - ) logging.error(f"Error encountered: {str(e)}") self.kill_spark_job() raise AirflowException(f"Task failed with error: {str(e)}") From 3ef83b3fba580bba026d66a75ea4e57319c7a2ba Mon Sep 17 00:00:00 2001 From: David Siklosi Date: Thu, 13 Feb 2025 13:43:51 +0100 Subject: [PATCH 134/134] 
Bumping tenacity version --- reqs/base.txt | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/reqs/base.txt b/reqs/base.txt index d9cc38a..279ed4b 100644 --- a/reqs/base.txt +++ b/reqs/base.txt @@ -4,4 +4,4 @@ envyaml==1.10.211231 mergedeep==1.3.4 slack==0.0.2 slackclient==2.9.4 -tenacity==8.2.3 +tenacity~=8.3.0 diff --git a/setup.py b/setup.py index 080a5bb..f8b4b28 100644 --- a/setup.py +++ b/setup.py @@ -59,6 +59,6 @@ def reqs(*f): packages=find_packages(), tests_require=test_requires, url="https://gitlab.com/goflash1/data/dagger", - version="0.9.0", + version="0.9.1", zip_safe=False, )