From 40a44364d37959036d33145cb0f742a60c55d874 Mon Sep 17 00:00:00 2001 From: jackleary Date: Mon, 3 Mar 2025 08:30:24 +0000 Subject: [PATCH 1/5] NRL-1320 Exit if df is null --- .../account-wide-infrastructure/modules/glue/src/pipeline.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/terraform/account-wide-infrastructure/modules/glue/src/pipeline.py b/terraform/account-wide-infrastructure/modules/glue/src/pipeline.py index fb71254c0..78b5b1bb6 100644 --- a/terraform/account-wide-infrastructure/modules/glue/src/pipeline.py +++ b/terraform/account-wide-infrastructure/modules/glue/src/pipeline.py @@ -89,6 +89,8 @@ def transform(self, dataframe): def load(self, data): """Load transformed data into Parquet format""" self.logger.info(f"Loading data into {self.target_path} as Parquet") + if not data: + return None for name, dataframe in data.items(): name = name.replace("--", "_") dataframe.write.mode("append").partitionBy(*self.partition_cols).parquet( From 5f087e2df88b2b93e05571c92876a9471e4e403b Mon Sep 17 00:00:00 2001 From: jackleary Date: Mon, 3 Mar 2025 08:32:00 +0000 Subject: [PATCH 2/5] NRL-1320 Exit if df is null --- .../account-wide-infrastructure/modules/glue/src/pipeline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/terraform/account-wide-infrastructure/modules/glue/src/pipeline.py b/terraform/account-wide-infrastructure/modules/glue/src/pipeline.py index 78b5b1bb6..99f0427e2 100644 --- a/terraform/account-wide-infrastructure/modules/glue/src/pipeline.py +++ b/terraform/account-wide-infrastructure/modules/glue/src/pipeline.py @@ -89,9 +89,9 @@ def transform(self, dataframe): def load(self, data): """Load transformed data into Parquet format""" self.logger.info(f"Loading data into {self.target_path} as Parquet") - if not data: - return None for name, dataframe in data.items(): + if not dataframe: + return None name = name.replace("--", "_") dataframe.write.mode("append").partitionBy(*self.partition_cols).parquet( f"{self.target_path}{name}" From 27983ae6914e391d415c926573e89dbba02dee6c Mon Sep 17 00:00:00 2001 From: jackleary Date: Mon, 3 Mar 2025 08:33:09 +0000 Subject: [PATCH 3/5] NRL-1320 Exit if df is null --- .../account-wide-infrastructure/modules/glue/src/pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/terraform/account-wide-infrastructure/modules/glue/src/pipeline.py b/terraform/account-wide-infrastructure/modules/glue/src/pipeline.py index 99f0427e2..aa4a5daeb 100644 --- a/terraform/account-wide-infrastructure/modules/glue/src/pipeline.py +++ b/terraform/account-wide-infrastructure/modules/glue/src/pipeline.py @@ -91,7 +91,7 @@ def load(self, data): self.logger.info(f"Loading data into {self.target_path} as Parquet") for name, dataframe in data.items(): if not dataframe: - return None + continue name = name.replace("--", "_") dataframe.write.mode("append").partitionBy(*self.partition_cols).parquet( f"{self.target_path}{name}" From 96a5e287c8cffb067cacb73f75ed570d24f8fb28 Mon Sep 17 00:00:00 2001 From: jackleary Date: Mon, 3 Mar 2025 09:36:25 +0000 Subject: [PATCH 4/5] NRL-1320 Exit if df is null --- .../modules/glue/src/pipeline.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/terraform/account-wide-infrastructure/modules/glue/src/pipeline.py b/terraform/account-wide-infrastructure/modules/glue/src/pipeline.py index aa4a5daeb..b8f96c201 100644 --- a/terraform/account-wide-infrastructure/modules/glue/src/pipeline.py +++ b/terraform/account-wide-infrastructure/modules/glue/src/pipeline.py @@ -90,12 +90,11 @@ def load(self, data): """Load transformed data into Parquet format""" self.logger.info(f"Loading data into {self.target_path} as Parquet") for name, dataframe in data.items(): - if not dataframe: - continue - name = name.replace("--", "_") - dataframe.write.mode("append").partitionBy(*self.partition_cols).parquet( - f"{self.target_path}{name}" - ) + if dataframe.na.drop().count() > 0: + name = name.replace("--", "_") + dataframe.write.mode("append").partitionBy( + *self.partition_cols + ).parquet(f"{self.target_path}{name}") def trigger_crawler(self): self.glue.start_crawler(Name=f"{self.name_prefix}-log-crawler") From eee2f0a0bad22b9546649208902460afc0ac1575 Mon Sep 17 00:00:00 2001 From: jackleary Date: Thu, 6 Mar 2025 11:26:02 +0000 Subject: [PATCH 5/5] NRL-1320 Log if df is null --- .../account-wide-infrastructure/modules/glue/src/pipeline.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/terraform/account-wide-infrastructure/modules/glue/src/pipeline.py b/terraform/account-wide-infrastructure/modules/glue/src/pipeline.py index b8f96c201..5ea40ffb8 100644 --- a/terraform/account-wide-infrastructure/modules/glue/src/pipeline.py +++ b/terraform/account-wide-infrastructure/modules/glue/src/pipeline.py @@ -95,6 +95,8 @@ def load(self, data): dataframe.write.mode("append").partitionBy( *self.partition_cols ).parquet(f"{self.target_path}{name}") + else: + self.logger.info(f"Dataframe {name} is null, skipping") def trigger_crawler(self): self.glue.start_crawler(Name=f"{self.name_prefix}-log-crawler")